| Title: | Estimate Incidence, Proportions and Exponential Growth Rates |
|---|---|
| Description: | Simple statistical models and visualisations for calculating the incidence, proportion, exponential growth rate, and reproduction number of infectious disease case time series. This toolkit was largely developed during the COVID-19 pandemic. |
| Authors: | Robert Challen [aut, cre, cph] (ORCID: <https://orcid.org/0000-0002-5504-7768>) |
| Maintainer: | Robert Challen <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 0.6.1 |
| Built: | 2026-05-10 08:42:48 UTC |
| Source: | https://github.com/ai4ci/ggoutbreak |
Time periods are just a zero based numeric representation of dates with a time unit baked in. This allows variable length periods (e.g. days or weeks), and fractional days to be represented in a consistent(ish) way between things that want to deal in dates (like ggplot) and things that want to deal in numbers (like model fitting)
as.time_period(x, ...) ## S3 method for class 'time_period' as.time_period(x, unit = NULL, start_date = NULL, ...) ## S3 method for class 'Date' as.time_period(x, unit = NULL, anchor = NULL, ...) ## S3 method for class 'numeric' as.time_period(x, unit = NULL, start_date = NULL, ...) ## S3 method for class 'grates_epiweek' as.time_period(x, ...) ## S3 method for class 'grates_isoweek' as.time_period(x, ...) ## S3 method for class 'grates_period' as.time_period(x, ...) ## S3 method for class 'time_period' seq(from, to = from, ...) is.time_period(x) date_to_time(dates, unit = NULL, start_date = NULL) time_to_date( timepoints, unit = attr(timepoints, "unit"), start_date = attr(timepoints, "start_date") )as.time_period(x, ...) ## S3 method for class 'time_period' as.time_period(x, unit = NULL, start_date = NULL, ...) ## S3 method for class 'Date' as.time_period(x, unit = NULL, anchor = NULL, ...) ## S3 method for class 'numeric' as.time_period(x, unit = NULL, start_date = NULL, ...) ## S3 method for class 'grates_epiweek' as.time_period(x, ...) ## S3 method for class 'grates_isoweek' as.time_period(x, ...) ## S3 method for class 'grates_period' as.time_period(x, ...) ## S3 method for class 'time_period' seq(from, to = from, ...) is.time_period(x) date_to_time(dates, unit = NULL, start_date = NULL) time_to_date( timepoints, unit = attr(timepoints, "unit"), start_date = attr(timepoints, "start_date") )
x |
a vector of dates, numbers (may be integer or real) or a |
... |
arguments passed to or from methods. |
unit |
the length of one unit of time. This will be either a integer
number of days, or a specification such as "1 week", or another |
start_date |
the zero time date as something that can be coerced to a
date. If the |
anchor |
only relevant if |
from, to
|
the starting and (maximal) end values of the
sequence. Of length |
dates |
a vector of dates to convert to a |
timepoints |
a |
a vector of class time_period
seq(time_period): Create a sequence using time_periods
is.time_period(): Check is a time_period
date_to_time(): Convert a set of dates to numeric timepoints
time_to_date(): Convert a set of time points to dates
tmp = as.time_period(grates::as_epiweek("2019-W12", format = "yearweek")+0:2)
testthat::expect_equal(
lubridate::epiweek(as.Date(tmp)),
c(12, 13, 14)
)
tmp = as.time_period(grates::as_isoweek("2019-W12", format = "yearweek")+0:2)
testthat::expect_equal(
lubridate::isoweek(as.Date(tmp)),
c(12, 13, 14)
)
x = as.time_period(as.Date("2025-01-01")+0:2, anchor="start", unit="1 day")
y = as.time_period(as.Date("2025-01-07")+0:2, anchor="start", unit="1 day")
testthat::expect_equal(as.numeric(c(x, y)), c(0, 1, 2, 6, 7, 8))
testthat::expect_equal(as.numeric(c(y, x)), c(0, 1, 2, -6, -5, -4))
z = as.time_period(as.Date("2025-01-01")+0:2*7, anchor="start", unit="1 week")
testthat::expect_equal(as.numeric(c(x, z)), c(0, 1, 2, 0, 7, 14))
testthat::expect_equal(
as.numeric(c(z, x)),
c(0, 1, 2, 0, 0.142857142857143, 0.285714285714286)
)
testthat::expect_equal(
as.Date(c(y, z)),
structure(c(20095, 20096, 20097, 20089, 20096, 20103), class = "Date")
)
seq(x[[1]], y[[1]])
#' # 100 weeks from 2020-01-01 tmp = as.time_period(0:100, 7, "2020-01-01") as.Date(tmp) range(tmp) min(tmp) tmp2 = as.integer(as.Date(tmp)) # testthat::expect_true(all(na.omit(tmp2-lag(tmp2)) == 7)) tmp2 = as.time_period(0:23, 1/24, "2020-01-01") as.POSIXct(tmp2) # convert timeseries to new "unit" tmp = as.time_period(0:100, 7, "2020-01-01") tmp2 = as.time_period(tmp,1) testthat::expect_equal(as.numeric(tmp2), 0:100*7) # 100 weeks from 2020-01-01 tmp = as.time_period(0:100, 7, "2020-01-01") as.Date(tmp) range(tmp) min(tmp) tmp2 = as.integer(as.Date(tmp)) # testthat::expect_true(all(na.omit(tmp2-lag(tmp2)) == 7)) tmp2 = as.time_period(0:23, 1/24, "2020-01-01") as.POSIXct(tmp2) # convert timeseries to new "unit" tmp = as.time_period(0:100, 7, "2020-01-01") tmp2 = as.time_period(tmp,1) testthat::expect_equal(as.numeric(tmp2), 0:100*7) # Time to date times = date_to_time(as.Date("2019-12-29")+0:100, "1 week") dates = time_to_date(times) # Date to time times = date_to_time(as.Date("2019-12-29")+0:100, "1 week") dates = time_to_date(times)#' # 100 weeks from 2020-01-01 tmp = as.time_period(0:100, 7, "2020-01-01") as.Date(tmp) range(tmp) min(tmp) tmp2 = as.integer(as.Date(tmp)) # testthat::expect_true(all(na.omit(tmp2-lag(tmp2)) == 7)) tmp2 = as.time_period(0:23, 1/24, "2020-01-01") as.POSIXct(tmp2) # convert timeseries to new "unit" tmp = as.time_period(0:100, 7, "2020-01-01") tmp2 = as.time_period(tmp,1) testthat::expect_equal(as.numeric(tmp2), 0:100*7) # 100 weeks from 2020-01-01 tmp = as.time_period(0:100, 7, "2020-01-01") as.Date(tmp) range(tmp) min(tmp) tmp2 = as.integer(as.Date(tmp)) # testthat::expect_true(all(na.omit(tmp2-lag(tmp2)) == 7)) tmp2 = as.time_period(0:23, 1/24, "2020-01-01") as.POSIXct(tmp2) # convert timeseries to new "unit" tmp = as.time_period(0:100, 7, "2020-01-01") tmp2 = as.time_period(tmp,1) testthat::expect_equal(as.numeric(tmp2), 0:100*7) # Time to date times = date_to_time(as.Date("2019-12-29")+0:100, "1 week") dates = time_to_date(times) # Date to time times = date_to_time(as.Date("2019-12-29")+0:100, "1 week") dates = time_to_date(times)
A scales breaks generator for log1p scales
breaks_log1p(n = 5, base = 10)breaks_log1p(n = 5, base = 10)
n |
the number of breaks |
base |
the base for the breaks |
a function for ggplot scale breaks
ggplot2::ggplot(ggplot2::diamonds, ggplot2::aes(x=price))+ ggplot2::geom_density()+ ggplot2::scale_x_continuous(trans="log1p", breaks=breaks_log1p())ggplot2::ggplot(ggplot2::diamonds, ggplot2::aes(x=price))+ ggplot2::geom_density()+ ggplot2::scale_x_continuous(trans="log1p", breaks=breaks_log1p())
Generate a random probability based on features of the simulation
cfg_beta_prob_rng(probability_fn = ~0.8, kappa_fn = ~0.1)cfg_beta_prob_rng(probability_fn = ~0.8, kappa_fn = ~0.1)
probability_fn |
a function which gives the time-varying mean of a beta
distribution, The function will be called minimally with |
kappa_fn |
a function which gives the time-varying dispersion of a beta
distribution. The function will be called with minimally |
a time dependent function that inputs a time (as a time_period) and
returns an probability for each day defined by the probability_fn and kappa_fn
fn = cfg_beta_prob_rng(~ ifelse(.x<=5,0.1,0.9)) fn(1:10)fn = cfg_beta_prob_rng(~ ifelse(.x<=5,0.1,0.9)) fn(1:10)
Get a IP generating function from time varying mean and SD of a gamma function
cfg_gamma_ip_fn(mean_fn = ~2, sd_fn = function(mean) sqrt(mean))cfg_gamma_ip_fn(mean_fn = ~2, sd_fn = function(mean) sqrt(mean))
mean_fn |
a function which gives the time-varying mean of a gamma
distribution, The function will be called minimally with |
sd_fn |
a function which gives the time-varying mean of a gamma
distribution. The function will be called with minimally |
a time dependent function that inputs a time (as a time_period) and
returns an ip delay distribution for each day defined by the mean_fn and sd_fn
fn = cfg_gamma_ip_fn(mean_fn = function(t) ifelse(t < 5, 4, 2)) # a gamma function that changes mean at time 5 fn(4) fn(7)fn = cfg_gamma_ip_fn(mean_fn = function(t) ifelse(t < 5, 4, 2)) # a gamma function that changes mean at time 5 fn(4) fn(7)
This is used for random sampling from the infectivity profile for times to infection, for example. There is nothing to stop you putting in a delay distribution with negative times but strange things may happen in your simulation.
cfg_ip_sampler_rng(ip = i_empirical_ip)cfg_ip_sampler_rng(ip = i_empirical_ip)
ip |
a long format empirical distribution - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
a function which accepts n parameter which produces random samples
from the ip distribution
tmp = cfg_ip_sampler_rng(example_ganyani_ip())(10000) # This discretised ganyani distribution is based on these figures: # mean: 5.2 (3.78-6.78) and sd: 1.72 (0.91-3.93) format_ip(example_ganyani_ip()) mean(tmp) # Should be about 5.2 stats::sd(tmp) # Should be about 1.72tmp = cfg_ip_sampler_rng(example_ganyani_ip())(10000) # This discretised ganyani distribution is based on these figures: # mean: 5.2 (3.78-6.78) and sd: 1.72 (0.91-3.93) format_ip(example_ganyani_ip()) mean(tmp) # Should be about 5.2 stats::sd(tmp) # Should be about 1.72
Linear function from dataframe
cfg_linear_fn(changes, ..., col_name = setdiff(colnames(changes), "t"))cfg_linear_fn(changes, ..., col_name = setdiff(colnames(changes), "t"))
changes |
a dataframe with |
... |
not used |
col_name |
the value column (optional if only 2 columns) |
a function that inputs a vector t and returns a linearly
interpolated value from <col_name>
Step function from dataframe
cfg_step_fn(changes, ..., col_name = setdiff(colnames(changes), "t"))cfg_step_fn(changes, ..., col_name = setdiff(colnames(changes), "t"))
changes |
a dataframe with |
... |
not used |
col_name |
the value column (optional if only 2 columns) |
a function that inputs a vector t and returns the next smallest
corresponding value in <col_name> (or the first one)
This is particularly designed for use within the fn_list_next_gen parameter
of sim_branching_process() to allow age group contract matrices to be applied
for example (assuming a categorical age).
cfg_transition_fn(transition)cfg_transition_fn(transition)
transition |
a transition matrix or long format dataframe. If a matrix the columns should add to 1, and the column names are the input class. row names are the output class. the data frame format must have input, output, and probability columns. |
a function that given an input will return samples of the output class according to the probability distributions.
age = rep(c("child","adult","elderly"),100) fn = cfg_transition_fn(dplyr::tribble( ~input, ~output, ~probability, "child", "child", 0.5, "child", "adult", 0.5, "adult", "child", 0.5, "adult", "adult", 0.3, "adult", "elderly", 0.2, "elderly","elderly", 0.5, "elderly","adult", 0.5, )) table(fn(age),age)age = rep(c("child","adult","elderly"),100) fn = cfg_transition_fn(dplyr::tribble( ~input, ~output, ~probability, "child", "child", 0.5, "child", "adult", 0.5, "adult", "child", 0.5, "adult", "adult", 0.3, "adult", "elderly", 0.2, "elderly","elderly", 0.5, "elderly","adult", 0.5, )) table(fn(age),age)
This function returns a random number generator from a gamma distribution that has a weekly period and configurable degree of additional variability around weekly pattern. It is suited to delays of things like testing which may depend on the day of week.
cfg_weekly_gamma_rng( mean = c(1, 1, 1, 1, 4, 3, 2), sd = sqrt(mean), week_starts = weekdays(as.Date("2024-10-14")) )cfg_weekly_gamma_rng( mean = c(1, 1, 1, 1, 4, 3, 2), sd = sqrt(mean), week_starts = weekdays(as.Date("2024-10-14")) )
mean |
a mean amount of delay for each day of the week |
sd |
the SD of the delay |
week_starts |
locale description of first day of week (default is a "Monday"). |
a random number generator taking t time parameter and returning a
duration the that time t depending on the day of the week.
fn = cfg_weekly_gamma_rng(c(1,1,1,1,4,3,2)) matrix(fn(1:42),ncol=7,byrow=TRUE)fn = cfg_weekly_gamma_rng(c(1,1,1,1,4,3,2)) matrix(fn(1:42),ncol=7,byrow=TRUE)
Weekly convolution distribution function
cfg_weekly_ip_fn( mean = c(1, 1, 1, 1, 4, 3, 2), sd = sqrt(mean), week_starts = weekdays(as.Date("2024-10-14")) )cfg_weekly_ip_fn( mean = c(1, 1, 1, 1, 4, 3, 2), sd = sqrt(mean), week_starts = weekdays(as.Date("2024-10-14")) )
mean |
The means of a gamma distributed delay function for each weekday |
sd |
The sds of a gamma distributed delay function for each weekday |
week_starts |
locale description of first day of week (default is a "Monday"). |
a time dependent function that inputs a time (as a time_period) and generates an IP delay distribution for each day varying by day of week
cat(sapply(cfg_weekly_ip_fn()(1:7),format_ip),sep = "\n")cat(sapply(cfg_weekly_ip_fn()(1:7),format_ip),sep = "\n")
This function returns a random probability generator that has a weekly period and configurable degree of additional variability around weekly pattern. It is suited to probabilities of things like testing which may depend on the day of week.
cfg_weekly_proportion_rng( prob = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.5, 0.5), kappa = 0.1, week_starts = weekdays(as.Date("2024-10-14")) )cfg_weekly_proportion_rng( prob = c(0.8, 0.8, 0.8, 0.8, 0.8, 0.5, 0.5), kappa = 0.1, week_starts = weekdays(as.Date("2024-10-14")) )
prob |
the rates of e.g. ascertainment for each day of the week. |
kappa |
dispersion parameter between 0 and 1. O is no dispersion. 1 is maximum |
week_starts |
locale description of first day of week (default is a "Monday"). |
a random number generator function taking t time parameter and
returning a probability of ascertainment for that time, depending on day of
week etc.
fn = cfg_weekly_proportion_rng(c(0.9,0.9,0.9,0.9,0.9,0.1,0.1)) matrix(fn(1:42),ncol=7,byrow=TRUE)fn = cfg_weekly_proportion_rng(c(0.9,0.9,0.9,0.9,0.9,0.1,0.1)) matrix(fn(1:42),ncol=7,byrow=TRUE)
The counterpart to date_seq_dates(). Take an original set of data and place
it within a regular time series where the periodicity of the time series may
be expressed as numbers of days, weeks, months quarters, or years, and the
periods are defined by an anchoring date, day of the week or by reference to
the start or end of the input dates. This can either return the periods as
dates or factors (e.g. for plotting) or as a time_period for analysis that
relies on a numeric representation of the date or duration from the anchor.
cut_date( dates, unit, anchor = "start", output = c("date", "factor", "time_period"), dfmt = "%d/%b/%y", ifmt = "{start} — {end}", ... )cut_date( dates, unit, anchor = "start", output = c("date", "factor", "time_period"), dfmt = "%d/%b/%y", ifmt = "{start} — {end}", ... )
dates |
a set of dates |
unit |
a period e.g. "1 week" |
anchor |
one of a date, "start" or "end" or a weekday name e.g. "mon" this will always be one of the start of the time periods we are cutting into |
output |
return the result as either a "date" (the default), an ordered "factor" with the date ranges as a label, or as a "time_period". The result is named with labels referring to the |
dfmt |
the |
ifmt |
a |
... |
ignored |
a set of dates, times or a factor level, representing the start of the period the date falls into, where the period is defined by the duration and the anchor
dates = as.Date(c("2020-01-01","2020-02-01","2020-01-15","2020-02-03",NA)) fs = date_seq(dates, "2 days") dates - cut_date(dates, "2 days") cut_date(dates,unit="2 days", output="time_period") # A weekly set of dates: dates2 = Sys.Date() + floor(stats::runif(50,max=10))*7 # in this specific situation the final date is not truncated because the # input data is seen as an exact match for the whole output period. cut_date(dates2, "1 week", "sun", output="factor") cut_date(dates2, dfmt = "%d/%b", output="factor", unit = "2 weeks", anchor="sun")dates = as.Date(c("2020-01-01","2020-02-01","2020-01-15","2020-02-03",NA)) fs = date_seq(dates, "2 days") dates - cut_date(dates, "2 days") cut_date(dates,unit="2 days", output="time_period") # A weekly set of dates: dates2 = Sys.Date() + floor(stats::runif(50,max=10))*7 # in this specific situation the final date is not truncated because the # input data is seen as an exact match for the whole output period. cut_date(dates2, "1 week", "sun", output="factor") cut_date(dates2, dfmt = "%d/%b", output="factor", unit = "2 weeks", anchor="sun")
This is useful if you want to fill in missing values that should have been observed but weren't. For example, date_seq(c(1, 2, 4, 6), 1) will return 1:6.
date_seq(x, period, ...)date_seq(x, period, ...)
x |
a numeric or date vector |
period |
Gap between each observation. The existing data will be checked to ensure that it is actually of this periodicity. |
... |
for subtype methods |
a vector of the same type as the input
date_seq(c(1, 2, 4, 5, 10), 1)date_seq(c(1, 2, 4, 5, 10), 1)
Derive from a vector of observation dates, a complete ordered sequence of
periods in a regular time series, where the length of the periods is
specified, as a number of days, weeks, years etcetera. E.g. this can convert a
random set of dates to a ordered complete list of 1 week intervals (or 2
month intervals) spanning the same range as the dates. This has some
interesting problems regarding where to put breaks within a month or week.
Often this is either based on a specific date (e.g. yearly periods starting
at 2020-01-01) or a day of week (e.g. 2 weekly periods staring on a Sunday)
or maybe relative to the input time series (weekly ending on the last date of
the data). There is also a problem when we consider data that may have
incomplete starting and end periods, which may not be comparable to other
periods, and we may need to exclude these from the result.
## S3 method for class 'Date' date_seq(x, period = .day_interval(x), anchor = "start", complete = FALSE, ...)## S3 method for class 'Date' date_seq(x, period = .day_interval(x), anchor = "start", complete = FALSE, ...)
x |
a vector of dates, possibly including NA values |
period |
the gap between observations as a number of days or as a natural language definition of the period such as "1 week", '2 weeks', '1 month', etcetera. If not given this will be derived from the dates. |
anchor |
defines a day that appears in the sequence (if it were to extend that far). Given either as a date, or "start", "end" or a day of the week, e.g. "mon". |
complete |
truncate incomplete start and end periods |
... |
ignored |
a vector of dates for regular periods between the minimum and maximum of dates, with the boundaries defined by the anchor.
date_seq(as.Date(c("2020-01-01","2020-02-01","2020-01-15","2020-02-01",NA)), "2 days")date_seq(as.Date(c("2020-01-01","2020-02-01","2020-01-15","2020-02-01",NA)), "2 days")
This is useful if you want to fill in missing values that should have been observed but weren't. For example, date_seq(c(1, 2, 4, 6), 1) will return 1:6.
## S3 method for class 'numeric' date_seq(x, period = 1, tol = 1e-06, ...)## S3 method for class 'numeric' date_seq(x, period = 1, tol = 1e-06, ...)
x |
a numeric or date vector |
period |
Gap between each observation. The existing data will be checked to ensure that it is actually of this periodicity. |
tol |
Numerical tolerance for checking periodicity. |
... |
for subtype methods |
a vector of the same type as the input
date_seq(c(1, 2, 4, 5, 10), 1)date_seq(c(1, 2, 4, 5, 10), 1)
time_period vector to the full range of possible timesDerive from a vector of observation time_periods, a complete ordered sequence of
periods in a regular time series, where the length of the periods is
specified, as a number of days, weeks, years etc. E.g. this can convert a
random set of times to a ordered complete list of 1 week intervals (or 2
month intervals) spanning the same range as the dates. This has some
interesting problems regarding where to put breaks within a month or week.
Often this is either based on a specific date (e.g. yearly periods starting
at 2020-01-01) or a day of week (e.g. 2 weekly periods staring on a sunday)
or maybe relative to the input time series (weekly ending on the last date of
the data). There is also a problem when we consider data that may have
incomplete starting and end periods, which may not be comparable to other
periods, and we may need to exclude these from the result.
## S3 method for class 'time_period' date_seq(x, period = attributes(x)$unit, complete = FALSE, ...)## S3 method for class 'time_period' date_seq(x, period = attributes(x)$unit, complete = FALSE, ...)
x |
a time period vector |
period |
the gap between observations as a number of days or as a natural language definition of the period such as "1 week", '2 weeks', '1 month', etc. If not given this will be derived from the dates. |
complete |
truncate incomplete start and end periods |
... |
ignored |
a vector of time_periods for regular periods between the minimum and maximum of
dates, with the boundaries defined by the anchor.
tmp = as.time_period(c(0,10,100), 7, "2020-01-01") date_seq(tmp, "7 days") date_seq(tmp, "1 day")tmp = as.time_period(c(0,10,100), 7, "2020-01-01") date_seq(tmp, "7 days") date_seq(tmp, "1 day")
Density, distribution function, quantile function and random
generation for the Beta distribution with parameters shape1 and
shape2 (and optional non-centrality parameter ncp).
dbeta2(x, prob, kappa, log = FALSE)dbeta2(x, prob, kappa, log = FALSE)
x |
vector of quantiles |
prob |
the mean probability (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
log |
logical; if TRUE, probabilities p are given as log(p). |
dbeta gives the density, pbeta the distribution
function, qbeta the quantile function, and rbeta
generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rbeta, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
dbeta2(c(0.25,0.5,0.75), 0.5, 0.25)dbeta2(c(0.25,0.5,0.75), 0.5, 0.25)
The following conversion describes the parameters mean and kappa
dcgamma(x, mean, kappa = 1/mean, log = FALSE)dcgamma(x, mean, kappa = 1/mean, log = FALSE)
x |
vector of quantiles |
mean |
the mean value on the true scale (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
log |
logical; if TRUE, probabilities p are given as log(p). |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
dcgamma(seq(0,4,0.25), 2, 0.5)dcgamma(seq(0,4,0.25), 2, 0.5)
Density, distribution function, quantile function and random
generation for the Gamma distribution with parameters shape and
scale.
dgamma2(x, mean, sd = sqrt(mean), log = FALSE)dgamma2(x, mean, sd = sqrt(mean), log = FALSE)
x |
vector of quantiles |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
log |
logical; if TRUE, probabilities p are given as log(p). |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
dgamma2(seq(0,4,0.25), 2, 1)dgamma2(seq(0,4,0.25), 2, 1)
Density, distribution function, quantile function and random
generation for the log normal distribution whose logarithm has mean
equal to meanlog and standard deviation equal to sdlog.
dlnorm2(x, mean = 1, sd = sqrt(exp(1) - 1), log = FALSE)dlnorm2(x, mean = 1, sd = sqrt(exp(1) - 1), log = FALSE)
x |
vector of quantiles |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
log |
logical; if TRUE, probabilities p are given as log(p). |
The log normal distribution has density
where and are the mean and standard
deviation of the logarithm.
The mean is ,
the median is , and the variance
and hence the coefficient of variation is
which is
approximately when that is small (e.g., ).
dlnorm gives the density,
plnorm gives the distribution function,
qlnorm gives the quantile function, and
rlnorm generates random deviates.
The length of the result is determined by n for
rlnorm, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
The cumulative hazard
is -plnorm(t, r, lower = FALSE, log = TRUE).
dlnorm is calculated from the definition (in ‘Details’).
[pqr]lnorm are based on the relationship to the normal.
Consequently, they model a single point mass at exp(meanlog)
for the boundary case sdlog = 0.
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. Wadsworth & Brooks/Cole.
Johnson, N. L., Kotz, S. and Balakrishnan, N. (1995) Continuous Univariate Distributions, volume 1, chapter 14. Wiley, New York.
Distributions for other standard distributions, including
dnorm for the normal distribution.
dlnorm2(seq(0,4,0.25), 2, 1)dlnorm2(seq(0,4,0.25), 2, 1)
The logit-normal distribution has a support of 0 to 1.
dlogitnorm(x, meanlogit = 0, sdlogit = 1, log = FALSE)dlogitnorm(x, meanlogit = 0, sdlogit = 1, log = FALSE)
x |
vector of quantiles |
meanlogit |
the mean on the logit scale |
sdlogit |
the sd on the logit scale |
log |
logical; if TRUE, probabilities p are given as log(p). |
a vector of probabilities, quantiles, densities or samples.
dlogitnorm(seq(0.1,0.9,0.1), 0, 1)dlogitnorm(seq(0.1,0.9,0.1), 0, 1)
The logit-normal distribution has a support of 0 to 1.
dlogitnorm2(x, prob.0.5 = 0.5, kappa = 1 - exp(-1), log = FALSE)dlogitnorm2(x, prob.0.5 = 0.5, kappa = 1 - exp(-1), log = FALSE)
x |
vector of quantiles |
prob.0.5 |
the median on the true scale |
kappa |
a dispersion parameter from 0 (none) to 1 maximum dispersion |
log |
logical; if TRUE, probabilities p are given as log(p). |
a vector of probabilities, quantiles, densities or samples.
q = seq(0.1,0.9,0.1) eps = sqrt(.Machine$double.eps) dlogitnorm2(q, 0.75, 0.2) (plogitnorm2(q+eps, 0.75, 0.2) - plogitnorm2(q-eps, 0.75, 0.2)) / (2*eps)q = seq(0.1,0.9,0.1) eps = sqrt(.Machine$double.eps) dlogitnorm2(q, 0.75, 0.2) (plogitnorm2(q+eps, 0.75, 0.2) - plogitnorm2(q-eps, 0.75, 0.2)) / (2*eps)
Density, distribution function, quantile function and random
generation for the negative binomial distribution with parameters
size and prob.
dnbinom2(x, mean, sd = sqrt(mean), log = FALSE)dnbinom2(x, mean, sd = sqrt(mean), log = FALSE)
x |
vector of (non-negative integer) quantiles. |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
log |
logical; if TRUE, probabilities p are given as log(p). |
The negative binomial distribution with size and
prob has density
for , and .
This represents the number of failures which occur in a sequence of
Bernoulli trials before a target number of successes is reached.
The mean is and variance .
A negative binomial distribution can also arise as a mixture of
Poisson distributions with mean distributed as a gamma distribution
(see pgamma) with scale parameter (1 - prob)/prob
and shape parameter size. (This definition allows non-integer
values of size.)
An alternative parametrization (often used in ecology) is by the
mean mu (see above), and size, the dispersion
parameter, where prob = size/(size+mu). The variance
is mu + mu^2/size in this parametrization.
If an element of x is not integer, the result of dnbinom
is zero, with a warning.
The case size == 0 is the distribution concentrated at zero.
This is the limiting distribution for size approaching zero,
even if mu rather than prob is held constant. Notice
though, that the mean of the limit distribution is 0, whatever the
value of mu.
The quantile is defined as the smallest value such that
, where is the distribution function.
dnbinom gives the density,
pnbinom gives the distribution function,
qnbinom gives the quantile function, and
rnbinom generates random deviates.
Invalid size or prob will result in return value
NaN, with a warning.
The length of the result is determined by n for
rnbinom, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
rnbinom returns a vector of type integer unless generated
values exceed the maximum representable integer when double
values are returned.
dnbinom computes via binomial probabilities, using code
contributed by Catherine Loader (see dbinom).
pnbinom uses pbeta.
qnbinom uses the Cornish–Fisher Expansion to include a skewness
correction to a normal approximation, followed by a search.
rnbinom uses the derivation as a gamma mixture of Poisson
distributions, see
Devroye, L. (1986) Non-Uniform Random Variate Generation. Springer-Verlag, New York. Page 480.
Distributions for standard distributions, including
dbinom for the binomial, dpois for the
Poisson and dgeom for the geometric distribution, which
is a special case of the negative binomial.
dnbinom2(0:5, 2, sqrt(2))dnbinom2(0:5, 2, sqrt(2))
Null distributions always returns NA
dnull(x, ...)dnull(x, ...)
x |
vector of quantiles |
... |
not used |
dnull(c(0.25,0.5,0.75), 0.5, 0.25)dnull(c(0.25,0.5,0.75), 0.5, 0.25)
The unit of doubling times is always days.
doubling_time(x, ...)doubling_time(x, ...)
x |
proportion or incidence growth rates - EITHER: a dataframe with columns:
Any grouping allowed. OR with columns:
Any grouping allowed. |
... |
not used |
the same dataframe with additional columns for doubling time or relative doubling time plus confidence intervals.
example_poisson_rt_smooth() %>% poisson_locfit_model(window=21) %>% doubling_time() %>% dplyr::glimpse()example_poisson_rt_smooth() %>% poisson_locfit_model(window=21) %>% doubling_time() %>% dplyr::glimpse()
The wedge distribution has a support of 0 to 1 and has a linear probability density function over that support.
dwedge(x, a, lower.tail = TRUE, log = FALSE)dwedge(x, a, lower.tail = TRUE, log = FALSE)
x |
vector of quantiles |
a |
a gradient from -2 (left skewed) to 2 (right skewed) |
lower.tail |
logical; if TRUE (default), probabilities are |
log |
logical; if TRUE, probabilities p are given as log(p). |
The rwedge can be combined with quantile functions to
skew standard distributions, or introduce correlation or down weight
certain parts of the distribution.
a vector of probabilities, quantiles, densities or samples.
pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )
Format date as dmy
fdmy(date)fdmy(date)
date |
a date to convert |
the formatted date
fdmy(Sys.Date())fdmy(Sys.Date())
Print a summary of an infectivity profile
format_ip(ip = i_empirical_ip)format_ip(ip = i_empirical_ip)
ip |
the infectivity profile to summarise.
Minimally grouped by: boot (and other groupings allowed). |
an infectivity profile description
format_ip(example_ganyani_ip())format_ip(example_ganyani_ip())
Delayed GAM reporting model function generator
gam_delayed_reporting( window, max_delay = 40, ..., knots_fn = ~gam_knots(.x, window, ...) )gam_delayed_reporting( window, max_delay = 40, ..., knots_fn = ~gam_knots(.x, window, ...) )
window |
controls the knot spacing in the GAM (if the default) |
max_delay |
the maximum delay we expect to model |
... |
Named arguments passed on to
|
knots_fn |
a function that takes the data as an input and returns
a set of integers as time points for GAM knots, for |
This function is used to configure a delayed reporting GAM model. The model is of the form:
count ~ s(time, bs = "cr", k = length(kts)) + s(log(tau), k = 4, pc = max_delay)
where tau is the difference between time series observation time and the
time of the data point in the time series, and we have multiple observations
of the same time series. This function helps specify the knots of the GAM and
the maximum expected delay
a list with 2 entries - model_fn and predict suitable as the
input for poisson_gam_model(model_fn = ..., predict=...).
data = example_delayed_observation() %>% dplyr::group_by(obs_time) cfg = gam_delayed_reporting(14,40) fit = cfg$model_fn(data) summary(fit)data = example_delayed_observation() %>% dplyr::group_by(obs_time) cfg = gam_delayed_reporting(14,40) fit = cfg$model_fn(data) summary(fit)
At the moment this does nothing sophisticated. An mostly equally spaced grid of knots with gaps at start and end to prevent over-fitting there. In the future this could look at the number of observations or areas where there is a lot of change to add in more knots.
gam_knots(data = i_incidence_data, window, ..., k = NULL)gam_knots(data = i_incidence_data, window, ..., k = NULL)
data |
the function will be called with incidence data - a dataframe with columns:
Any grouping allowed. |
window |
the spacing between knots |
... |
currently not used |
k |
alternative to |
a vector of times (as a numeric)
gam_knots(example_poisson_rt(), 14) gam_knots(example_poisson_rt(), k=10)gam_knots(example_poisson_rt(), 14) gam_knots(example_poisson_rt(), k=10)
This function configures a very simple negative binomial count model and using:
gam_nb_model_fn(window, ..., knots_fn = ~gam_knots(.x, window, ...))gam_nb_model_fn(window, ..., knots_fn = ~gam_knots(.x, window, ...))
window |
controls the knot spacing in the GAM (if the default) |
... |
Named arguments passed on to
|
knots_fn |
a function that takes the data as an input and returns
a set of integers as time points for GAM knots, for |
count ~ s(time, bs = "cr", k = length(kts))
The control of knot positions is defined by the knots_fn and window
parameters.
a function suitable as the input for
poisson_gam_model(model_fn = ...).
model_fn = gam_nb_model_fn(14) fit = model_fn(example_poisson_rt() %>% dplyr::ungroup()) summary(fit)model_fn = gam_nb_model_fn(14) fit = model_fn(example_poisson_rt() %>% dplyr::ungroup()) summary(fit)
This function configures a very simple poisson count model and using:
gam_poisson_model_fn(window, ..., knots_fn = ~gam_knots(.x, window, ...))gam_poisson_model_fn(window, ..., knots_fn = ~gam_knots(.x, window, ...))
window |
controls the knot spacing in the GAM (if the default) |
... |
Named arguments passed on to
|
knots_fn |
a function that takes the data as an input and returns
a set of integers as time points for GAM knots, for |
count ~ s(time, bs = "cr", k = length(kts))
The control of knot positions is defined by the knots_fn and window
parameters.
a function suitable as the input for
poisson_gam_model(model_fn = ...).
model_fn = gam_poisson_model_fn(14) fit = model_fn(example_poisson_rt() %>% dplyr::ungroup()) summary(fit)model_fn = gam_poisson_model_fn(14) fit = model_fn(example_poisson_rt() %>% dplyr::ungroup()) summary(fit)
The x axis must be a date.
geom_events( events = i_events, event_label_size = 7, event_label_colour = "black", event_label_angle = -30, event_line_colour = "grey50", event_fill_colour = "grey50", hide_labels = FALSE, guide_axis = ggplot2::derive(), x_axis_style = c("date", "time_period"), ... )geom_events( events = i_events, event_label_size = 7, event_label_colour = "black", event_label_angle = -30, event_line_colour = "grey50", event_fill_colour = "grey50", hide_labels = FALSE, guide_axis = ggplot2::derive(), x_axis_style = c("date", "time_period"), ... )
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
event_label_size |
how big to make the event label |
event_label_colour |
the event label colour |
event_label_angle |
the event label colour |
event_line_colour |
the event line colour |
event_fill_colour |
the event area fill |
hide_labels |
do not show labels at all |
guide_axis |
a guide axis configuration for the labels
(see |
x_axis_style |
should the x-axis be |
... |
Named arguments passed on to
Named arguments passed on to
|
a set of geoms for a time series.
This assumes a modelled incidence estimate that is log-normal. The exponential growth rate is the first derivative of the mu parameters of this log-normal. On the link scale these are normally distributed. This function assumes that the time series incidence estimates are uncorrelated to estimate the error in the growth rate, which is a conservative approach resulting in more uncertainty in growth rate than might be possible through other methods. This is all based on Savitsky-Golay filters applied to the normally distributed log-incidence estimates.
growth_rate_from_incidence(d = i_incidence_model, window = 7, deg = 2)growth_rate_from_incidence(d = i_incidence_model, window = 7, deg = 2)
d |
a modelled incidence estimate - a dataframe with columns:
Any grouping allowed. |
window |
the width of the Savitsky-Golay filter - must be odd |
deg |
the polynomial degree of the filter |
the timeseries with growth rate columns: A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
growth.fit (double) - an estimate of the growth rate
growth.se.fit (positive_double) - the standard error the growth rate
growth.0.025 (double) - lower confidence limit of the growth rate
growth.0.5 (double) - median estimate of the growth rate
growth.0.975 (double) - upper confidence limit of the growth rate
Any grouping allowed.
data = example_poisson_growth_rate() tmp2 = data %>% poisson_glm_model(window=7,deg=1) %>% growth_rate_from_incidence(window = 13, deg=1) if(interactive()) { plot_incidence(tmp2, date_labels="%b %y") plot_growth_rate( tmp2, date_labels="%b %y" )+ sim_geom_function(data,colour="red") }data = example_poisson_growth_rate() tmp2 = data %>% poisson_glm_model(window=7,deg=1) %>% growth_rate_from_incidence(window = 13, deg=1) if(interactive()) { plot_incidence(tmp2, date_labels="%b %y") plot_growth_rate( tmp2, date_labels="%b %y" )+ sim_geom_function(data,colour="red") }
This assumes a prevalence that is logit-normally distributed. The exponential growth rate is the first derivative of the mu parameters of this logit-normal. On the link scale these are normally distributed. This function assumes that the time series incidence estimates are uncorrelated to estimate the error in the growth rate, which is a conservative approach resulting in more uncertainty in growth rate than might be possible through other methods. This is all based on Savitsky-Golay filters applied to the normally distributed logit-proportion estimates.
growth_rate_from_prevalence(d = i_prevalence_model, window = 7, deg = 2)growth_rate_from_prevalence(d = i_prevalence_model, window = 7, deg = 2)
d |
a modelled proportion estimate - a dataframe with columns:
Any grouping allowed. |
window |
the width of the Savitsky-Golay filter - must be odd |
deg |
the polynomial degree of the filter |
the timeseries with growth rate columns: A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
proportion.fit (double) - an estimate of the proportion on a logit scale
proportion.se.fit (positive_double) - the standard error of proportion estimate on a logit scale
proportion.0.025 (proportion) - lower confidence limit of proportion (true scale)
proportion.0.5 (proportion) - median estimate of proportion (true scale)
proportion.0.975 (proportion) - upper confidence limit of proportion (true scale)
relative.growth.fit (double) - an estimate of the relative growth rate
relative.growth.se.fit (positive_double) - the standard error the relative growth rate
relative.growth.0.025 (double) - lower confidence limit of the relative growth rate
relative.growth.0.5 (double) - median estimate of the relative growth rate
relative.growth.0.975 (double) - upper confidence limit of the relative growth rate
Any grouping allowed.
data = example_poisson_rt_2class() tmp = data %>% proportion_glm_model(window=7,deg=2) %>% dplyr::select(time, dplyr::starts_with("proportion")) %>% dplyr::rename_with(~ gsub("proportion","prevalence",.x)) %>% dplyr::select(-prevalence.fit, -prevalence.se.fit) tmp = tmp %>% growth_rate_from_prevalence(window = 25, deg=1) plot_growth_rate( tmp, date_labels="%b %y" ) data1 = ukc19::ons_infection_survey %>% dplyr::filter(name=="England") %>% timeseries(count=FALSE) tmp2 = data1 %>% growth_rate_from_prevalence(window = 25, deg=1) if(interactive()) { plot_growth_rate( tmp2, date_labels="%b %y", mapping = ggplot2::aes(colour=name), events = ukc19::timeline ) }data = example_poisson_rt_2class() tmp = data %>% proportion_glm_model(window=7,deg=2) %>% dplyr::select(time, dplyr::starts_with("proportion")) %>% dplyr::rename_with(~ gsub("proportion","prevalence",.x)) %>% dplyr::select(-prevalence.fit, -prevalence.se.fit) tmp = tmp %>% growth_rate_from_prevalence(window = 25, deg=1) plot_growth_rate( tmp, date_labels="%b %y" ) data1 = ukc19::ons_infection_survey %>% dplyr::filter(name=="England") %>% timeseries(count=FALSE) tmp2 = data1 %>% growth_rate_from_prevalence(window = 25, deg=1) if(interactive()) { plot_growth_rate( tmp2, date_labels="%b %y", mapping = ggplot2::aes(colour=name), events = ukc19::timeline ) }
This assumes a modelled proportion that is logit-normally distributed. The exponential growth rate is the first derivative of the mu parameters of this logit-normal. On the link scale these are normally distributed. This function assumes that the time series incidence estimates are uncorrelated to estimate the error in the growth rate, which is a conservative approach resulting in more uncertainty in growth rate than might be possible through other methods. This is all based on Savitsky-Golay filters applied to the normally distributed logit-proportion estimates.
growth_rate_from_proportion(d = i_proportion_model, window = 7, deg = 2)growth_rate_from_proportion(d = i_proportion_model, window = 7, deg = 2)
d |
a modelled proportion estimate - a dataframe with columns:
Any grouping allowed. |
window |
the width of the Savitsky-Golay filter - must be odd |
deg |
the polynomial degree of the filter |
the timeseries with growth rate columns: A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
proportion.fit (double) - an estimate of the proportion on a logit scale
proportion.se.fit (positive_double) - the standard error of proportion estimate on a logit scale
proportion.0.025 (proportion) - lower confidence limit of proportion (true scale)
proportion.0.5 (proportion) - median estimate of proportion (true scale)
proportion.0.975 (proportion) - upper confidence limit of proportion (true scale)
relative.growth.fit (double) - an estimate of the relative growth rate
relative.growth.se.fit (positive_double) - the standard error the relative growth rate
relative.growth.0.025 (double) - lower confidence limit of the relative growth rate
relative.growth.0.5 (double) - median estimate of the relative growth rate
relative.growth.0.975 (double) - upper confidence limit of the relative growth rate
Any grouping allowed.
data = example_poisson_rt_2class() tmp2 = data %>% proportion_glm_model(window=7,deg=2) %>% growth_rate_from_proportion(window = 13, deg=1) if(interactive()) { plot_proportion(tmp2, date_labels="%b %y") plot_growth_rate( tmp2, date_labels="%b %y" ) }data = example_poisson_rt_2class() tmp2 = data %>% proportion_glm_model(window=7,deg=2) %>% growth_rate_from_proportion(window = 13, deg=1) if(interactive()) { plot_proportion(tmp2, date_labels="%b %y") plot_growth_rate( tmp2, date_labels="%b %y" ) }
This function augments any timeseries with a population denominator. The population data may be static estimates, or a set of estimates at time points. The population data may be grouped in which case the grouping might be geographical area or age group or gender for example. The two inputs must have compatible grouping (i.e. all the groups in the population data must be present in the timeseries).
infer_population(df = i_timeseries, pop = i_population_data)infer_population(df = i_timeseries, pop = i_population_data)
df |
A time series, or a grouped collection of time series. - a dataframe with columns:
Any grouping allowed. |
pop |
The population data must be grouped in the same way as
Any grouping allowed. |
the df timeseries with additional population column
# The COVID data already has a population column so we are just double checking: data = example_england_covid_by_age() %>% dplyr::rename(pop_old = population) demog = ukc19::uk_population_2019_by_5yr_age %>% dplyr::group_by(code, name, class) data %>% infer_population(demog) %>% dplyr::glimpse()# The COVID data already has a population column so we are just double checking: data = example_england_covid_by_age() %>% dplyr::rename(pop_old = population) demog = ukc19::uk_population_2019_by_5yr_age %>% dplyr::group_by(code, name, class) data %>% infer_population(demog) %>% dplyr::glimpse()
Log-scaled incidence estimates are used to generate samples of incidence. These are convolved with the duration of infection (as the probability still infected) on any given day (as a set of discrete distributions) and the result is evaluated as a fraction of population. The result is fitted to a logit-normal (using moments) and quantiles produced from samples. If test sensitivity is used instead of duration of infection then the prevalence estimate will be the expected proportion of test positives by screening test rather than a true prevalence. Differences between this and the proportion of test positives observed may be due to ascertainment bias or test sensitivity misspecification.
infer_prevalence( modelled = i_incidence_model, pop = i_population_data, ip = i_discrete_ip, bootstraps = 1000, seed = Sys.time(), ... )infer_prevalence( modelled = i_incidence_model, pop = i_population_data, ip = i_discrete_ip, bootstraps = 1000, seed = Sys.time(), ... )
modelled |
Model output from processing the
Any grouping allowed. |
pop |
The population data must be grouped in the same way as
Any grouping allowed. |
ip |
A discrete distribution representing the duration of infection (or probability of detection of infection). This will not sum to one. - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
bootstraps |
the number of samples to take at each time point. This will be rounded up to a whole multiple of the infection duration distribution length. |
seed |
a random number seed for reproducibility |
... |
not used |
the modelled input with additional proportion columns
tmp = example_poisson_rt_smooth() %>% poisson_locfit_model(window=14) %>% infer_prevalence( pop = 10000, ip = example_ip() ) if(interactive()) { plot_prevalence(tmp) }tmp = example_poisson_rt_smooth() %>% poisson_locfit_model(window=14) %>% infer_prevalence( pop = 10000, ip = example_ip() ) if(interactive()) { plot_prevalence(tmp) }
This enables incidence rates are able to be compared to a baseline figure for
incidence. The baseline could come for example from a population average or
average incidence over time. The output is an incidence rate ratio. The
incidence_baseline column is a rate of events per unit time. The time unit
is expected to be the same as that of the date in modelled and this is not
checked.
infer_rate_ratio( modelled = i_incidence_model, base = i_baseline_incidence_data, ... )infer_rate_ratio( modelled = i_incidence_model, base = i_baseline_incidence_data, ... )
modelled |
Model output from something like
Any grouping allowed. |
base |
The baseline data must be grouped in the same way as
Any grouping allowed. |
... |
not used |
a dataframe with incidence rate ratios for each of the classes in modelled. A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rate_ratio.0.025 (positive_double) - lower confidence limit of the rate ratio for a population group
rate_ratio.0.5 (positive_double) - median estimate of the rate ratio for a population group
rate_ratio.0.975 (positive_double) - upper confidence limit of the rate ratio for a population group
Any grouping allowed.
# not age stratified baseline = example_poisson_locfit() %>% dplyr::mutate(baseline_incidence = incidence.0.5) # age stratified rate ratios: tmp = example_poisson_age_stratified() %>% infer_rate_ratio(baseline) %>% dplyr::glimpse()# not age stratified baseline = example_poisson_locfit() %>% dplyr::mutate(baseline_incidence = incidence.0.5) # age stratified rate ratios: tmp = example_poisson_age_stratified() %>% infer_rate_ratio(baseline) %>% dplyr::glimpse()
This assumes that for example, case distribution proportions are stratified by a population grouping, e.g. geography or age, and we have estimates of the size of that population during that time period. Normalising by population proportion allows us to compare the relative risk of the outcome in groups, compared to the expected population risk if the outcome is evenly distributed across the population. There may be other proportions other than population fractions that may be useful to compare. At the moment this does not handle any uncertainty.
infer_risk_ratio( modelled = i_proportion_model, base = i_baseline_proportion_data, ... )infer_risk_ratio( modelled = i_proportion_model, base = i_baseline_proportion_data, ... )
modelled |
Model output from something like
Any grouping allowed. |
base |
The baseline data must be grouped in the same way as
Any grouping allowed. |
... |
not used |
a dataframe with relative risk / risk ratio columns. A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
risk_ratio.0.025 (positive_double) - lower confidence limit of the excess risk ratio for a population group
risk_ratio.0.5 (positive_double) - median estimate of the excess risk ratio for a population group
risk_ratio.0.975 (positive_double) - upper confidence limit of the excess risk ratio for a population group
Any grouping allowed.
demog = ukc19::uk_population_2019_by_5yr_age %>% dplyr::filter(name == "England") tmp = example_proportion_age_stratified() %>% infer_risk_ratio(demog) %>% dplyr::glimpse() if(interactive()) { plot_growth_phase(tmp,duration = 14*4)+ ggplot2::scale_colour_viridis_d()+ ggplot2::coord_cartesian(xlim=c(-0.25,0.25),ylim=c(0.02,20)) }demog = ukc19::uk_population_2019_by_5yr_age %>% dplyr::filter(name == "England") tmp = example_proportion_age_stratified() %>% infer_risk_ratio(demog) %>% dplyr::glimpse() if(interactive()) { plot_growth_phase(tmp,duration = 14*4)+ ggplot2::scale_colour_viridis_d()+ ggplot2::coord_cartesian(xlim=c(-0.25,0.25),ylim=c(0.02,20)) }
Strictly integer breaks for continuous scale
integer_breaks(n = 5, ...)integer_breaks(n = 5, ...)
n |
number of breaks |
... |
further arguments for methods. |
a ggplot breaks function
This solves the relationship between $R_t$ and growth rates as described by Wallinga and Lipsitch, to get a growth rate from $R_t$ and infectivity profile.
inv_wallinga_lipsitch( Rt, y = i_empirical_ip, a1 = seq(0.5, length.out = length(y)), a0 = dplyr::lag(a1, default = 0) )inv_wallinga_lipsitch( Rt, y = i_empirical_ip, a1 = seq(0.5, length.out = length(y)), a0 = dplyr::lag(a1, default = 0) )
Rt |
a vector of reproduction numbers |
y |
an empirical infectivity profile as a probability vector or as a dataframe of format: A dataframe containing the following columns:
Minimally grouped by: boot (and other groupings allowed). |
a1 |
the end time of the infectivity profile probability estimate (defaults to 0.5,1.5,2.5,...). |
a0 |
the start time of the infectivity profile probability estimate (defaults to 0,0.5,1.5,...). |
This function uses a single empirical distribution for the infectivity profile / generation time. If multiple are provided then the average central value is chosen (i.e. this does not propagate uncertainty in infectivity profile)
an vector of growth rates
inv_wallinga_lipsitch(Rt=seq(0.5,2.5,length.out=9), y=example_ip())inv_wallinga_lipsitch(Rt=seq(0.5,2.5,length.out=9), y=example_ip())
Check whether vector is a date
is.Date(x)is.Date(x)
x |
a vector to check |
TRUE if dates, FALSE otherwise
is.Date(Sys.Date())is.Date(Sys.Date())
Extract the weekday, month or quarter, or the Julian time (days since some origin). These are generic functions: the methods for the internal date-time classes are documented here.
## S3 method for class 'time_period' julian(x, ...)## S3 method for class 'time_period' julian(x, ...)
x |
an object inheriting from class |
... |
arguments for other methods. |
weekdays and months return a character
vector of names in the locale in use, i.e., Sys.getlocale("LC_TIME").
quarters returns a character vector of "Q1" to
"Q4".
julian returns the number of days (possibly fractional)
since the origin, with the origin as a "origin" attribute.
All time calculations in R are done ignoring leap-seconds.
Other components such as the day of the month or the year are
very easy to compute: just use as.POSIXlt and extract
the relevant component. Alternatively (especially if the components
are desired as character strings), use strftime.
DateTimeClasses, Date;
Sys.getlocale("LC_TIME") crucially for months() and weekdays().
## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time
Create a set of labels for a time period based on the start and duration of
the period. The format is configurable using the start and end dates and the
dfmt and ifmt parameters, however if the time period has names then these
are used in preference.
## S3 method for class 'time_period' labels( object, ..., dfmt = "%d/%b", ifmt = "{start} — {end}", na.value = "Unknown" )## S3 method for class 'time_period' labels( object, ..., dfmt = "%d/%b", ifmt = "{start} — {end}", na.value = "Unknown" )
object |
a set of decimal times as a time_period |
... |
not used |
dfmt |
a |
ifmt |
a glue spec referring to |
na.value |
a label for |
a set of character labels for the time
eg = as.time_period(Sys.Date()+0:10*7, unit="1 week", anchor="start") labels(eg) labels(eg, ifmt="{start}", dfmt="%d/%b/%y") labels(eg, ifmt="until {end}", dfmt="%d %b %Y") # labels retained in constructor: eg2 = Sys.Date()+0:10*7 names(eg2) = paste0("week ",0:10) labels(eg2) labels(as.time_period(eg2, anchor="start"))eg = as.time_period(Sys.Date()+0:10*7, unit="1 week", anchor="start") labels(eg) labels(eg, ifmt="{start}", dfmt="%d/%b/%y") labels(eg, ifmt="until {end}", dfmt="%d %b %Y") # labels retained in constructor: eg2 = Sys.Date()+0:10*7 names(eg2) = paste0("week ",0:10) labels(eg2) labels(as.time_period(eg2, anchor="start"))
ggoutbreak compatible case linelist.Coerce an object to a ggoutbreak compatible case linelist.
linelist(x, ...)linelist(x, ...)
x |
An object to coerce |
... |
Named arguments passed on to
|
minimally a time stamped linelist dataframe A dataframe containing the following columns:
time (ggoutbreak::time_period) - A set of events with a timestamp as a time_period
Any grouping allowed.
(Sys.Date()+stats::runif(100)*7) %>% linelist()(Sys.Date()+stats::runif(100)*7) %>% linelist()
Perform logit scaling with correct axis formatting. To not be used directly but with ggplot (e.g. ggplot2::scale_y_continuous(trans = "logit") )
logit_trans(n = 5, ...)logit_trans(n = 5, ...)
n |
the number of breaks |
... |
not used, for compatibility |
A scales object
dplyr::tibble(pvalue = c(0.001, 0.05, 0.1), fold_change = 1:3) %>% ggplot2::ggplot(ggplot2::aes(fold_change , pvalue)) + ggplot2::geom_point() + ggplot2::scale_y_continuous(trans = "logit")dplyr::tibble(pvalue = c(0.001, 0.05, 0.1), fold_change = 1:3) %>% ggplot2::ggplot(ggplot2::aes(fold_change , pvalue)) + ggplot2::geom_point() + ggplot2::scale_y_continuous(trans = "logit")
EpiEstim style matrixRecover a long format infectivity profile from an EpiEstim style matrix
make_empirical_ip(omega, normalise = TRUE)make_empirical_ip(omega, normalise = TRUE)
omega |
a matrix of probabilities, starting at time zero, with columns representing one possible infectivity profile, with the fist value being the probability at time zero (to 0.5). Alternatively this can be a vector of probabilities for one single profile, resulting in 1 bootstrap. |
normalise |
is this a probability mass function? In which case we make the
sum of this equal to one (the default). If |
a long format ip delay distribution
format_ip(make_empirical_ip(c(0,0,1,1,1,2,2,2,1,1)))format_ip(make_empirical_ip(c(0,0,1,1,1,2,2,2,1,1)))
Generate a simple discrete infectivity profile from a gamma distribution
make_fixed_ip(mean, sd = sqrt(mean), epiestim_compat = FALSE)make_fixed_ip(mean, sd = sqrt(mean), epiestim_compat = FALSE)
mean |
The mean of a gamma distribution |
sd |
The sd of a gamma distribution |
epiestim_compat |
Should the discretisation support EpiEstim
(i.e. |
a discrete infectivity profile with 1 bootstrap
tmp = make_fixed_ip(5,2) if(interactive()) { plot_ip(tmp, alpha=1) }tmp = make_fixed_ip(5,2) if(interactive()) { plot_ip(tmp, alpha=1) }
The infectivity profile is typically fitted to data by MCMC and reported as median and 95% credible intervals, of the mean, and the SD of (usually) a gamma distribution. This function generates a discrete infectivity probability distribution representing the chance that an infectee was infected on any specific day after the infector was infected (given that the infectee was infected).
make_gamma_ip( median_of_mean, lower_ci_of_mean = median_of_mean, upper_ci_of_mean = median_of_mean, median_of_sd = sqrt(median_of_mean), lower_ci_of_sd = median_of_sd, upper_ci_of_sd = median_of_sd, correlation = NA, n_boots = 100, epiestim_compat = FALSE, epiestim_sampler = epiestim_compat, z_crit = 0.95, seed = Sys.time() )make_gamma_ip( median_of_mean, lower_ci_of_mean = median_of_mean, upper_ci_of_mean = median_of_mean, median_of_sd = sqrt(median_of_mean), lower_ci_of_sd = median_of_sd, upper_ci_of_sd = median_of_sd, correlation = NA, n_boots = 100, epiestim_compat = FALSE, epiestim_sampler = epiestim_compat, z_crit = 0.95, seed = Sys.time() )
median_of_mean, lower_ci_of_mean, upper_ci_of_mean
|
Quantiles of the infectivity profile mean. |
median_of_sd, lower_ci_of_sd, upper_ci_of_sd
|
Quantiles of the infectivity profile SD. |
correlation |
the correlation between mean and sd. this is optional and will be inferred if not provided. |
n_boots |
The number of samples to generate. |
epiestim_compat |
Use |
epiestim_sampler |
Use |
z_crit |
the width of the confidence intervals (defaults to 95%). |
seed |
a RNG seed |
EpiEstim generates these distributions by sampling from a truncated normal
distribution for both mean and sd. The means and sds thus produced are
discretised using a gamma distribution offset by 1 day, to enforce that the
probability of infection on day zero is zero.
This constraint changes the shape of the distribution somewhat and may cause
a small bias (although there is no ground truth to evaluate). In this
function two different sampling and discretisation strategy are provided. The
sampler uses log-normal distributions for both mean and SD with a degree of
correlation. The discretizer assigns probabilities direct from the CDF of the
gamma distribution without offset. This results in non zero values for the
probability at time zero and can only be used with Rt estimation methods that
can handle zero/negative serial intervals (e.g. rt_from_incidence or
rt_from_renewal, or rt_from_growth_rate). The alternative follows
EpiEstims algorithm.
a long format infectivity profile data frame, or a list of dataframes if input is a vector.
# COVID-19 estimates from Ganyani et al 2020. tmp = make_gamma_ip(5.2, 3.78, 6.78, 1.72, 0.91, 3.93, epiestim_sampler=FALSE, epiestim_compat=FALSE) tmp %>% dplyr::group_by(boot) %>% dplyr::summarise( mean = sum(tau*probability), sd = sqrt(sum((tau-sum(tau*probability))^2*probability)) ) %>% dplyr::summarise( mean = sprintf("%1.2f [%1.2f-%1.2f]", stats::quantile(mean,0.5), stats::quantile(mean,0.025), stats::quantile(mean,0.975)), sd = sprintf("%1.2f [%1.2f-%1.2f]", stats::quantile(sd,0.5), stats::quantile(sd,0.025), stats::quantile(sd,0.975)) ) if(interactive()) { plot_ip(tmp, alpha=0.1) + ggplot2::coord_cartesian(xlim=c(0,15)) } means = c(3,4,5) ips = make_gamma_ip(means) if (interactive()) { purrr::map2(ips,means, ~ .x %>% dplyr::mutate(label = sprintf("Mean: %1.2f",.y))) %>% purrr::map( ~ plot_ip(.x,alpha=0.1)+ggplot2::facet_wrap(~label)) }# COVID-19 estimates from Ganyani et al 2020. tmp = make_gamma_ip(5.2, 3.78, 6.78, 1.72, 0.91, 3.93, epiestim_sampler=FALSE, epiestim_compat=FALSE) tmp %>% dplyr::group_by(boot) %>% dplyr::summarise( mean = sum(tau*probability), sd = sqrt(sum((tau-sum(tau*probability))^2*probability)) ) %>% dplyr::summarise( mean = sprintf("%1.2f [%1.2f-%1.2f]", stats::quantile(mean,0.5), stats::quantile(mean,0.025), stats::quantile(mean,0.975)), sd = sprintf("%1.2f [%1.2f-%1.2f]", stats::quantile(sd,0.5), stats::quantile(sd,0.025), stats::quantile(sd,0.975)) ) if(interactive()) { plot_ip(tmp, alpha=0.1) + ggplot2::coord_cartesian(xlim=c(0,15)) } means = c(3,4,5) ips = make_gamma_ip(means) if (interactive()) { purrr::map2(ips,means, ~ .x %>% dplyr::mutate(label = sprintf("Mean: %1.2f",.y))) %>% purrr::map( ~ plot_ip(.x,alpha=0.1)+ggplot2::facet_wrap(~label)) }
The infectivity profile is typically fitted to data by MCMC as a gamma distribution. This function generates a discrete infectivity probability distribution representing the chance that an infectee was infected on any specific day after the infector was infected (given that the infectee was infected), from posterior samples.
make_posterior_ip( ..., mean, sd, shape, rate, scale, epiestim_compat = FALSE, n_boots = 100 )make_posterior_ip( ..., mean, sd, shape, rate, scale, epiestim_compat = FALSE, n_boots = 100 )
... |
not used, must be empty |
mean |
a vector of gamma distribution means |
sd |
a vector of gamma distribution sds |
shape |
a vector of gamma distribution shape parameters |
rate |
a vector of gamma distribution rate parameters |
scale |
a vector of gamma distribution scale parameters |
epiestim_compat |
Use |
n_boots |
if there are more posterior samples than this limit then a maximum
of |
If using EpiEstim and coarseDataTools::dic.fit.mcmc the output of the
MCMC will be a S4 object with a samples slot, containing a dataframe of
shape=var1 and scale=var2 columns. to use this output with
make_posterior_ip invoke it like this:
do.call(make_posterior_ip, SI_fit_clever@samples %>% dplyr::rename(shape=var1, scale=var2))
N.b. only one combination of mean and sd, shape and rate, or shape and scale, are required.
a long format ip delay distribution
tmp = make_posterior_ip( mean = stats::rnorm(100,5,0.1), sd = stats::rnorm(100,1.5,0.1) ) tmp %>% dplyr::glimpse() if (interactive()) plot_ip(tmp)tmp = make_posterior_ip( mean = stats::rnorm(100,5,0.1), sd = stats::rnorm(100,1.5,0.1) ) tmp %>% dplyr::glimpse() if (interactive()) plot_ip(tmp)
Suits larger contact tracing data sets where there is a delay between 2 events which may or may not be precisely known.
make_resampled_ip( tau, min_tau = pmax(tau - 0.5, truncate), max_tau = tau + 0.5, add_noise = TRUE, truncate = 0, n_boots = 100, seed = Sys.time() )make_resampled_ip( tau, min_tau = pmax(tau - 0.5, truncate), max_tau = tau + 0.5, add_noise = TRUE, truncate = 0, n_boots = 100, seed = Sys.time() )
tau |
the delay between first and second events |
min_tau |
the minimum delay for interval censored delays |
max_tau |
the maximum delay for interval censored delays |
add_noise |
adds noise to each date point for each replicate |
truncate |
what is the minimum realistic value of the parameter |
n_boots |
number of replicates to generate |
seed |
a random number seed for reproducibility |
a long format ip delay distribution
tau = rgamma2(100, 5,2) ip = make_resampled_ip(min_tau = tau-1, max_tau = tau+1, seed = 100) if(interactive()) { plot_ip(ip,alpha=0.1) }tau = rgamma2(100, 5,2) ip = make_resampled_ip(min_tau = tau-1, max_tau = tau+1, seed = 100) if(interactive()) { plot_ip(ip,alpha=0.1) }
max.Date returns an integer and -Inf for a set of NA dates. This
is usually inconvenient.
max_date(x, ...)max_date(x, ...)
x |
a vector of dates |
... |
ignored |
a date. '0001-01-01“ if there is no well defined minimum.
max_date(NA)max_date(NA)
min.Date returns an integer and Inf for a set of NA dates. This
is usually inconvenient.
min_date(x, ...)min_date(x, ...)
x |
a vector of dates |
... |
ignored |
a date. 9999-12-31 if there is no well defined minimum.
min_date(NA)min_date(NA)
Extract the weekday, month or quarter, or the Julian time (days since some origin). These are generic functions: the methods for the internal date-time classes are documented here.
## S3 method for class 'time_period' months(x, abbreviate = FALSE)## S3 method for class 'time_period' months(x, abbreviate = FALSE)
x |
an object inheriting from class |
abbreviate |
logical vector (possibly recycled). Should the names be abbreviated? |
weekdays and months return a character
vector of names in the locale in use, i.e., Sys.getlocale("LC_TIME").
quarters returns a character vector of "Q1" to
"Q4".
julian returns the number of days (possibly fractional)
since the origin, with the origin as a "origin" attribute.
All time calculations in R are done ignoring leap-seconds.
Other components such as the day of the month or the year are
very easy to compute: just use as.POSIXlt and extract
the relevant component. Alternatively (especially if the components
are desired as character strings), use strftime.
DateTimeClasses, Date;
Sys.getlocale("LC_TIME") crucially for months() and weekdays().
## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time
Takes a list of times, classes and counts, e.g. a COGUK variant like data set
with time, (multinomial) class (e.g. variant) and count being the count in
that time period. Fits a quadratic B-spline on time to the proportion of the
data using nnet::multinom, with approx one degree of freedom per class and per
window units of the time series.
multinomial_nnet_model( d = i_multinomial_input, ..., window = 14, frequency = "1 day", predict = TRUE, .progress = interactive() )multinomial_nnet_model( d = i_multinomial_input, ..., window = 14, frequency = "1 day", predict = TRUE, .progress = interactive() )
d |
A multi-class count input dataframe - a dataframe with columns:
Minimally grouped by: class (and other groupings allowed). |
... |
not used |
window |
a number of data points between knots, smaller values result in less smoothing, large value in more. |
frequency |
the temporal density of the output estimates. |
predict |
result a prediction. If false we return the model. |
.progress |
show a CLI progress bar |
Additional groupings are treated as distinct proportions models.
a new dataframe with time (as a time period), class, and proportion.0.5, or a model object
data = example_poisson_rt_2class() tmp = data %>% multinomial_nnet_model(window=14) if (interactive()) { plot_multinomial(tmp, date_labels="%b %y") }data = example_poisson_rt_2class() tmp = data %>% multinomial_nnet_model(window=14) if (interactive()) { plot_multinomial(tmp, date_labels="%b %y") }
This assumes positive disease counts are stratified by a population grouping, e.g. geography or age, and we have estimates of the size of that population during that time period. Normalising by population size allows us to compare groups.
normalise_count( raw = i_incidence_data, pop = i_population_data, ..., population_unit = 1e+05, normalise_time = FALSE )normalise_count( raw = i_incidence_data, pop = i_population_data, ..., population_unit = 1e+05, normalise_time = FALSE )
raw |
The count data - a dataframe with columns:
Any grouping allowed. |
pop |
The population data must be grouped in the same way as
Any grouping allowed. |
... |
not used |
population_unit |
What population unit do you want the count data normalised to e.g. per 100K |
normalise_time |
The default behaviour for normalising is to keep it in
the same time units as the input data. If this parameter is set to |
a dataframe with incidence rates per unit capita. A dataframe containing the following columns:
population (positive_integer) - Size of population
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
Any grouping allowed.
data = example_england_covid_by_age() demog = ukc19::uk_population_2019_by_5yr_age data %>% normalise_count(demog) %>% dplyr::glimpse()data = example_england_covid_by_age() demog = ukc19::uk_population_2019_by_5yr_age data %>% normalise_count(demog) %>% dplyr::glimpse()
This assumes positive disease counts are stratified by a population grouping, e.g. geography or age, and we have estimates of the size of that population during that time period. Normalising by population size allows us to compare groups.
normalise_incidence( modelled = i_incidence_model, pop = i_population_data, ..., population_unit = 1e+05, normalise_time = FALSE )normalise_incidence( modelled = i_incidence_model, pop = i_population_data, ..., population_unit = 1e+05, normalise_time = FALSE )
modelled |
Model output from processing the
Any grouping allowed. |
pop |
The population data must be grouped in the same way as
Any grouping allowed. |
... |
not used |
population_unit |
what population unit do you want the incidence in e.g. per 100K |
normalise_time |
The default behaviour for incidence is to keep it in
the same time units as the input data. If this parameter is set to |
a dataframe with incidence rates per unit capita. A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
incidence.per_capita.fit (double) - an estimate of the incidence per capita rate on a log scale
incidence.per_capita.se.fit (positive_double) - the standard error of the incidence per capita rate estimate on a log scale
incidence.per_capita.0.025 (positive_double) - lower confidence limit of the incidence per capita rate (true scale)
incidence.per_capita.0.5 (positive_double) - median estimate of the incidence per capita rate (true scale)
incidence.per_capita.0.975 (positive_double) - upper confidence limit of the incidence per capita rate (true scale)
population_unit (double) - The population unit on which the per capita incidence rate is calculated
time_unit (lubridate::as.period) - The time period over which the per capita incidence rate is calculated
Any grouping allowed.
model = example_poisson_age_stratified() demog = ukc19::uk_population_2019_by_5yr_age model %>% normalise_incidence(demog) %>% dplyr::glimpse()model = example_poisson_age_stratified() demog = ukc19::uk_population_2019_by_5yr_age model %>% normalise_incidence(demog) %>% dplyr::glimpse()
This converts a long format infectivity profile to something that will work
with EpiEstim. In general ggoutbreak will do the conversion internally.
This is provided as a utility for examples mostly.
omega_matrix(ip = i_discrete_ip, epiestim_compat = TRUE)omega_matrix(ip = i_discrete_ip, epiestim_compat = TRUE)
ip |
long format infectivity profile - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
epiestim_compat |
ensure the matrix works with |
a matrix with 0:max(tau) rows and 1:max(boot) columns.
omega_matrix(example_ip())omega_matrix(example_ip())
Density, distribution function, quantile function and random
generation for the Beta distribution with parameters shape1 and
shape2 (and optional non-centrality parameter ncp).
pbeta2(q, prob, kappa, lower.tail = TRUE, log.p = FALSE)pbeta2(q, prob, kappa, lower.tail = TRUE, log.p = FALSE)
q |
vector of quantiles |
prob |
the mean probability (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
dbeta gives the density, pbeta the distribution
function, qbeta the quantile function, and rbeta
generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rbeta, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
pbeta2(c(0.25,0.5,0.75), 0.5, 0.25)pbeta2(c(0.25,0.5,0.75), 0.5, 0.25)
The following conversion describes the parameters mean and kappa
pcgamma(q, mean, kappa = 1/mean, lower.tail = TRUE, log.p = FALSE)pcgamma(q, mean, kappa = 1/mean, lower.tail = TRUE, log.p = FALSE)
q |
vector of quantiles |
mean |
the mean value on the true scale (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
pcgamma(seq(0,4,0.25), 2, 0.5)pcgamma(seq(0,4,0.25), 2, 0.5)
Density, distribution function, quantile function and random
generation for the Gamma distribution with parameters shape and
scale.
pgamma2(q, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)pgamma2(q, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)
q |
vector of quantiles |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
pgamma2(seq(0,4,0.25), 2, 1)pgamma2(seq(0,4,0.25), 2, 1)
Density, distribution function, quantile function and random
generation for the log normal distribution whose logarithm has mean
equal to meanlog and standard deviation equal to sdlog.
plnorm2(q, mean = 1, sd = sqrt(exp(1) - 1), lower.tail = TRUE, log.p = FALSE)plnorm2(q, mean = 1, sd = sqrt(exp(1) - 1), lower.tail = TRUE, log.p = FALSE)
q |
vector of quantiles |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are
|
log.p |
logical; if TRUE, probabilities p are given as log(p). |
The log normal distribution has density
where and are the mean and standard
deviation of the logarithm.
The mean is ,
the median is , and the variance
and hence the coefficient of variation is
which is
approximately when that is small (e.g., ).
dlnorm gives the density,
plnorm gives the distribution function,
qlnorm gives the quantile function, and
rlnorm generates random deviates.
The length of the result is determined by n for
rlnorm, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
The cumulative hazard
is -plnorm(t, r, lower = FALSE, log = TRUE).
dlnorm is calculated from the definition (in ‘Details’).
[pqr]lnorm are based on the relationship to the normal.
Consequently, they model a single point mass at exp(meanlog)
for the boundary case sdlog = 0.
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. Wadsworth & Brooks/Cole.
Johnson, N. L., Kotz, S. and Balakrishnan, N. (1995) Continuous Univariate Distributions, volume 1, chapter 14. Wiley, New York.
Distributions for other standard distributions, including
dnorm for the normal distribution.
plnorm2(seq(0,4,0.25), 2, 1)plnorm2(seq(0,4,0.25), 2, 1)
The logit-normal distribution has a support of 0 to 1.
plogitnorm(q, meanlogit = 0, sdlogit = 1, lower.tail = TRUE, log.p = FALSE)plogitnorm(q, meanlogit = 0, sdlogit = 1, lower.tail = TRUE, log.p = FALSE)
q |
vector of quantiles |
meanlogit |
the mean on the logit scale |
sdlogit |
the sd on the logit scale |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
a vector of probabilities, quantiles, densities or samples.
plogitnorm(seq(0.1,0.9,0.1), 0, 1)plogitnorm(seq(0.1,0.9,0.1), 0, 1)
The logit-normal distribution has a support of 0 to 1.
plogitnorm2( q, prob.0.5 = 0.5, kappa = 1 - exp(-1), lower.tail = TRUE, log.p = FALSE )plogitnorm2( q, prob.0.5 = 0.5, kappa = 1 - exp(-1), lower.tail = TRUE, log.p = FALSE )
q |
vector of quantiles |
prob.0.5 |
the median on the true scale |
kappa |
a dispersion parameter from 0 (none) to 1 maximum dispersion |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
a vector of probabilities, quantiles, densities or samples.
plogitnorm2(seq(0.1,0.9,0.1), 0.75, 0.2) plogitnorm2(qlogitnorm2(seq(0.1,0.9,0.1), 0.75, 0.2), 0.75, 0.2)plogitnorm2(seq(0.1,0.9,0.1), 0.75, 0.2) plogitnorm2(qlogitnorm2(seq(0.1,0.9,0.1), 0.75, 0.2), 0.75, 0.2)
Plot a line-list of cases as a histogram
plot_cases( raw, ..., mapping = .check_for_aes(raw, ..., class_aes = "fill"), events = i_events )plot_cases( raw, ..., mapping = .check_for_aes(raw, ..., class_aes = "fill"), events = i_events )
raw |
The raw case data either as a summarised count or as a line-list - EITHER: a dataframe with columns:
Any grouping allowed. OR with columns:
Any grouping allowed. |
... |
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot object
with_defaults("2025-01-01" ,"1 day", { tmp = dplyr::tibble( time = as.time_period( rexpgrowth(100,0.05,40)), class = rep(c("one","two","three"), length.out=100) ) }) if(interactive()) { plot_cases(tmp, mapping=ggplot2::aes(fill = class),linewidth=0.1,colour="white") }with_defaults("2025-01-01" ,"1 day", { tmp = dplyr::tibble( time = as.time_period( rexpgrowth(100,0.05,40)), class = rep(c("one","two","three"), length.out=100) ) }) if(interactive()) { plot_cases(tmp, mapping=ggplot2::aes(fill = class),linewidth=0.1,colour="white") }
Plot a raw case count timeseries
plot_counts(raw, ..., mapping = .check_for_aes(raw, ...), events = i_events)plot_counts(raw, ..., mapping = .check_for_aes(raw, ...), events = i_events)
raw |
The raw count data, or the raw count data normalised by population (
see
Any grouping allowed. OR with columns:
Any grouping allowed. |
... |
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot object
# example code tmp = example_england_covid_by_age() %>% time_aggregate(count=sum(count)) %>% normalise_count(pop=56489700, population_unit=1000, normalise_time=TRUE) # normalised by England population (56489700 people) if(interactive()) { plot_counts(tmp, colour="blue",size=0.25) }# example code tmp = example_england_covid_by_age() %>% time_aggregate(count=sum(count)) %>% normalise_count(pop=56489700, population_unit=1000, normalise_time=TRUE) # normalised by England population (56489700 people) if(interactive()) { plot_counts(tmp, colour="blue",size=0.25) }
Plot an incidence or proportion versus growth phase diagram
plot_growth_phase( modelled, timepoints = NULL, duration = max(dplyr::count(modelled)$n), interval = 7, mapping = if (interfacer::is_col_present(modelled, class)) { ggplot2::aes(colour = class) } else { ggplot2::aes() }, cis = TRUE, ... )plot_growth_phase( modelled, timepoints = NULL, duration = max(dplyr::count(modelled)$n), interval = 7, mapping = if (interfacer::is_col_present(modelled, class)) { ggplot2::aes(colour = class) } else { ggplot2::aes() }, cis = TRUE, ... )
modelled |
Growth rates and incidence / proportion timeseries as the
outputs of functions such as
Any grouping allowed. OR with columns:
Any grouping allowed. OR with columns:
Any grouping allowed. OR with columns:
Any grouping allowed. OR with NULL |
timepoints |
time points (as |
duration |
the length of the growth rate phase trail |
interval |
the length of time between markers on the phase plot |
mapping |
a |
cis |
logical; should the phases be marked with confidence intervals? |
... |
Named arguments passed on to
Named arguments passed on to
|
a ggplot
data = example_poisson_rt_2class() tmp2 = data %>% poisson_locfit_model() timepoints = as.Date(tmp2$time[c(40,80,120,160)]) if(interactive()) { plot_growth_phase(tmp2, timepoints, duration=108) }data = example_poisson_rt_2class() tmp2 = data %>% poisson_locfit_model() timepoints = as.Date(tmp2$time[c(40,80,120,160)]) if(interactive()) { plot_growth_phase(tmp2, timepoints, duration=108) }
Growth rate timeseries diagram
plot_growth_rate( modelled, ..., mapping = .check_for_aes(modelled, ...), events = i_events )plot_growth_rate( modelled, ..., mapping = .check_for_aes(modelled, ...), events = i_events )
modelled |
a growth rate dataframe. - EITHER: a dataframe with columns:
Any grouping allowed. OR with columns:
Any grouping allowed. |
... |
Named arguments passed on to
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot
data = example_poisson_rt_2class() tmp2 = data %>% poisson_locfit_model() if(interactive()) { plot_growth_rate(tmp2) }data = example_poisson_rt_2class() tmp2 = data %>% poisson_locfit_model() if(interactive()) { plot_growth_rate(tmp2) }
Plot an incidence timeseries
plot_incidence( modelled, raw = i_incidence_data, ..., mapping = .check_for_aes(modelled, ...), events = i_events )plot_incidence( modelled, raw = i_incidence_data, ..., mapping = .check_for_aes(modelled, ...), events = i_events )
modelled |
An optional estimate of the incidence time series. If
Any grouping allowed. OR with columns:
Any grouping allowed. |
raw |
The raw count data (optional - if given overlaid as points on top of modelled estimate) - a dataframe with columns:
Any grouping allowed. |
... |
Named arguments passed on to
Named arguments passed on to
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot object
# example code tmp = example_poisson_rt_2class() tmp2 = tmp %>% poisson_locfit_model() if(interactive()) { plot_incidence(tmp2,tmp,size=0.25) }# example code tmp = example_poisson_rt_2class() tmp2 = tmp %>% poisson_locfit_model() if(interactive()) { plot_incidence(tmp2,tmp,size=0.25) }
Plot an infectivity profile
plot_ip(ip = i_empirical_ip, alpha = NULL, ...)plot_ip(ip = i_empirical_ip, alpha = NULL, ...)
ip |
A long format infectivity profile - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
alpha |
the alpha value of the bootstrap lines |
... |
passed onto geom_segment for controlling line thickness, alpha etc. |
an ggplot object
if(interactive()) { plot_ip(example_ganyani_ip()) }if(interactive()) { plot_ip(example_ganyani_ip()) }
A multinomial proportions model will tell you what proportion each class has versus others in the data set. In this case the denominator is the total count across across all classes.
plot_multinomial( modelled = i_multinomial_proportion_model, ..., mapping = ggplot2::aes(fill = class), events = i_events, normalise = FALSE )plot_multinomial( modelled = i_multinomial_proportion_model, ..., mapping = ggplot2::aes(fill = class), events = i_events, normalise = FALSE )
modelled |
the multinomial count data - a dataframe with columns:
Must be grouped by: class (exactly). |
... |
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
normalise |
make sure the probabilities add up to one - this can be a bad idea if you know you may have missing values, on the other hand not all proportions models are guaranteed to add up to one. |
a ggplot
tmp = example_proportion_age_stratified() %>% dplyr::group_by(class) %>% dplyr::glimpse() if(interactive()) { plot_multinomial(tmp, normalise=TRUE)+ ggplot2::scale_fill_viridis_d() }tmp = example_proportion_age_stratified() %>% dplyr::group_by(class) %>% dplyr::glimpse() if(interactive()) { plot_multinomial(tmp, normalise=TRUE)+ ggplot2::scale_fill_viridis_d() }
plot_prevalence( modelled = i_prevalence_model, raw = i_proportion_data, ..., mapping = .check_for_aes(modelled, ...), events = i_events )plot_prevalence( modelled = i_prevalence_model, raw = i_proportion_data, ..., mapping = .check_for_aes(modelled, ...), events = i_events )
modelled |
Prevalence estimates - a dataframe with columns:
Any grouping allowed. |
raw |
Raw proportion data - a dataframe with columns:
Any grouping allowed. |
... |
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot object
if(interactive()) { plot_prevalence( ukc19::ons_infection_survey %>% dplyr::mutate(time = as.time_period(date,"1 day")), mapping = ggplot2::aes(colour=name)) }if(interactive()) { plot_prevalence( ukc19::ons_infection_survey %>% dplyr::mutate(time = as.time_period(date,"1 day")), mapping = ggplot2::aes(colour=name)) }
Plot a proportions timeseries
plot_proportion( modelled = i_proportion_model, raw = i_proportion_data, ..., mapping = .check_for_aes(modelled, ...), events = i_events )plot_proportion( modelled = i_proportion_model, raw = i_proportion_data, ..., mapping = .check_for_aes(modelled, ...), events = i_events )
modelled |
Proportion model estimates - a dataframe with columns:
Any grouping allowed. |
raw |
Raw count data with denominator - a dataframe with columns:
Any grouping allowed. |
... |
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot object
tmp = example_poisson_rt_2class() %>% proportion_locfit_model(window=21) %>% dplyr::glimpse() if(interactive()) { plot_proportion(tmp)+ ggplot2::scale_fill_viridis_d(aesthetics = c("fill","colour")) }tmp = example_poisson_rt_2class() %>% proportion_locfit_model(window=21) %>% dplyr::glimpse() if(interactive()) { plot_proportion(tmp)+ ggplot2::scale_fill_viridis_d(aesthetics = c("fill","colour")) }
Plot a raw case count proportion timeseries
plot_proportions_data( raw = i_proportion_data, ..., mapping = .check_for_aes(raw, ...), events = i_events )plot_proportions_data( raw = i_proportion_data, ..., mapping = .check_for_aes(raw, ...), events = i_events )
raw |
The raw count and denominator data - a dataframe with columns:
Any grouping allowed. |
... |
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot object
tmp = example_england_covid_by_age() %>% dplyr::filter(class %in% c("50_54","80_84")) if(interactive()) { plot_proportions_data(tmp, mapping= ggplot2::aes(colour=class))+ggplot2::geom_line() }tmp = example_england_covid_by_age() %>% dplyr::filter(class %in% c("50_54","80_84")) if(interactive()) { plot_proportions_data(tmp, mapping= ggplot2::aes(colour=class))+ggplot2::geom_line() }
Reproduction number timeseries diagram
plot_rt( modelled = i_reproduction_number, ..., mapping = .check_for_aes(modelled, ...), events = i_events )plot_rt( modelled = i_reproduction_number, ..., mapping = .check_for_aes(modelled, ...), events = i_events )
modelled |
the modelled Rt estimate - a dataframe with columns:
Any grouping allowed. |
... |
Named arguments passed on to
Named arguments passed on to
|
mapping |
a |
events |
Significant events or time spans - a dataframe with columns:
Any grouping allowed. A default value is defined. |
a ggplot timeseries
# example code if (interactive()) { tmp2 = example_poisson_locfit() %>% dplyr::filter(as.Date(time) >= "2021-01-01" & as.Date(time) < "2022-01-01") %>% rt_from_incidence(ip = example_ganyani_ip()) # comparing RT from growth rates with England consensus Rt # (N.B. offset by 14 days to align with estimates): plot_rt(tmp2,colour="blue")+ ggplot2::geom_errorbar( data= ukc19::spim_consensus %>% dplyr::filter(date-14 >= "2021-01-01" & date-14 < "2022-01-01"), mapping=ggplot2::aes(x=date-14,ymin=rt.low,ymax=rt.high), colour="red") }# example code if (interactive()) { tmp2 = example_poisson_locfit() %>% dplyr::filter(as.Date(time) >= "2021-01-01" & as.Date(time) < "2022-01-01") %>% rt_from_incidence(ip = example_ganyani_ip()) # comparing RT from growth rates with England consensus Rt # (N.B. offset by 14 days to align with estimates): plot_rt(tmp2,colour="blue")+ ggplot2::geom_errorbar( data= ukc19::spim_consensus %>% dplyr::filter(date-14 >= "2021-01-01" & date-14 < "2022-01-01"), mapping=ggplot2::aes(x=date-14,ymin=rt.low,ymax=rt.high), colour="red") }
Density, distribution function, quantile function and random
generation for the negative binomial distribution with parameters
size and prob.
pnbinom2(q, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)pnbinom2(q, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)
q |
vector of quantiles. |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are
|
log.p |
logical; if TRUE, probabilities p are given as log(p). |
The negative binomial distribution with size and
prob has density
for , and .
This represents the number of failures which occur in a sequence of
Bernoulli trials before a target number of successes is reached.
The mean is and variance .
A negative binomial distribution can also arise as a mixture of
Poisson distributions with mean distributed as a gamma distribution
(see pgamma) with scale parameter (1 - prob)/prob
and shape parameter size. (This definition allows non-integer
values of size.)
An alternative parametrization (often used in ecology) is by the
mean mu (see above), and size, the dispersion
parameter, where prob = size/(size+mu). The variance
is mu + mu^2/size in this parametrization.
If an element of x is not integer, the result of dnbinom
is zero, with a warning.
The case size == 0 is the distribution concentrated at zero.
This is the limiting distribution for size approaching zero,
even if mu rather than prob is held constant. Notice
though, that the mean of the limit distribution is 0, whatever the
value of mu.
The quantile is defined as the smallest value such that
, where is the distribution function.
dnbinom gives the density,
pnbinom gives the distribution function,
qnbinom gives the quantile function, and
rnbinom generates random deviates.
Invalid size or prob will result in return value
NaN, with a warning.
The length of the result is determined by n for
rnbinom, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
rnbinom returns a vector of type integer unless generated
values exceed the maximum representable integer when double
values are returned.
dnbinom computes via binomial probabilities, using code
contributed by Catherine Loader (see dbinom).
pnbinom uses pbeta.
qnbinom uses the Cornish–Fisher Expansion to include a skewness
correction to a normal approximation, followed by a search.
rnbinom uses the derivation as a gamma mixture of Poisson
distributions, see
Devroye, L. (1986) Non-Uniform Random Variate Generation. Springer-Verlag, New York. Page 480.
Distributions for standard distributions, including
dbinom for the binomial, dpois for the
Poisson and dgeom for the geometric distribution, which
is a special case of the negative binomial.
pnbinom2(0:5, 2, sqrt(2))pnbinom2(0:5, 2, sqrt(2))
Null distributions always returns NA
pnull(q, ...)pnull(q, ...)
q |
vector of quantiles |
... |
not used |
pnull(c(0.25,0.5,0.75), 0.5, 0.25)pnull(c(0.25,0.5,0.75), 0.5, 0.25)
This function lets the user supply a fitting function that models incidence, and provides a set of machinery for applying it to groups, extracting incidence, growth rates, and optionally reproduction numbers from the fit(s). This is more advanced than other estimators as there are far more configuration of GAM models, so this aims to provide sensible defaults with the option to bring your own model.
poisson_gam_model( d, ..., frequency = "1 day", ip = i_discrete_ip, quick = FALSE, .progress = interactive() )poisson_gam_model( d, ..., frequency = "1 day", ip = i_discrete_ip, quick = FALSE, .progress = interactive() )
d |
input data - EITHER: a dataframe with columns:
Minimally grouped by: obs_time (and other groupings allowed). OR with columns:
Any grouping allowed. |
... |
Named arguments passed on to
Named arguments passed on to
|
frequency |
the density of the output estimates as a time period such as
|
ip |
An infectivity profile (optional) if not given (the default) the Rt value will not be estimated - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
quick |
if |
.progress |
show a CLI progress bar |
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
incidence.fit (double) - an estimate of the incidence rate on a log scale
incidence.se.fit (positive_double) - the standard error of the incidence rate estimate on a log scale
incidence.0.025 (positive_double) - lower confidence limit of the incidence rate (true scale)
incidence.0.5 (positive_double) - median estimate of the incidence rate (true scale)
incidence.0.975 (positive_double) - upper confidence limit of the incidence rate (true scale)
growth.fit (double) - an estimate of the growth rate
growth.se.fit (positive_double) - the standard error the growth rate
growth.0.025 (double) - lower confidence limit of the growth rate
growth.0.5 (double) - median estimate of the growth rate
growth.0.975 (double) - upper confidence limit of the growth rate
Any grouping allowed.
additionally if ip is given: A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rt.fit (double) - an estimate of the reproduction number
rt.se.fit (positive_double) - the standard error of the reproduction number
rt.0.025 (double) - lower confidence limit of the reproduction number
rt.0.5 (double) - median estimate of the reproduction number
rt.0.975 (double) - upper confidence limit of the reproduction number
Any grouping allowed.
# Simple poisson model data = example_poisson_rt_smooth() tmp2 = poisson_gam_model(data,window=7,ip=example_ip(),quick=TRUE) if (interactive()) { plot_incidence( tmp2, date_labels="%b %y", raw=data ) plot_rt( tmp2, date_labels="%b %y" )+ sim_geom_function(data,colour="red") } # example with delayed observation model. # This data is all the day by day observations of the whole timeseries from # the beginning of the outbreak. data2 = example_delayed_observation() model = gam_delayed_reporting(window = 14) tmp3 = data2 %>% poisson_gam_model( model_fn = model$model_fn, predict = model$predict, ip=example_ip()) if (interactive()) { plot_incidence(tmp3)+ ggplot2::geom_line( data=data2 %>% dplyr::filter(obs_time %% 10 == 0), mapping = ggplot2::aes(x=as.Date(time),y=count,colour=as.factor(obs_time)) ) plot_rt(tmp3) }# Simple poisson model data = example_poisson_rt_smooth() tmp2 = poisson_gam_model(data,window=7,ip=example_ip(),quick=TRUE) if (interactive()) { plot_incidence( tmp2, date_labels="%b %y", raw=data ) plot_rt( tmp2, date_labels="%b %y" )+ sim_geom_function(data,colour="red") } # example with delayed observation model. # This data is all the day by day observations of the whole timeseries from # the beginning of the outbreak. data2 = example_delayed_observation() model = gam_delayed_reporting(window = 14) tmp3 = data2 %>% poisson_gam_model( model_fn = model$model_fn, predict = model$predict, ip=example_ip()) if (interactive()) { plot_incidence(tmp3)+ ggplot2::geom_line( data=data2 %>% dplyr::filter(obs_time %% 10 == 0), mapping = ggplot2::aes(x=as.Date(time),y=count,colour=as.factor(obs_time)) ) plot_rt(tmp3) }
This uses a generalised linear model to fit a quasi-poisson model with a time
varying rate as a natural cubic spline with approx one degree of freedom per
window units of the time series.
poisson_glm_model( d = i_incidence_input, ..., window = 14, frequency = "1 day", .progress = interactive() )poisson_glm_model( d = i_incidence_input, ..., window = 14, frequency = "1 day", .progress = interactive() )
d |
Count model input - a dataframe with columns:
Any grouping allowed. |
... |
not used and present to allow proportion model to be used in a
|
window |
a number of data points defining the bandwidth of the estimate,
smaller values result in less smoothing, large value in more. The default
value of 14 is calibrated for data provided on a daily frequency, with
weekly data a lower value may be preferred. - default |
frequency |
the density of the output estimates as a time period such as
|
.progress |
show a CLI progress bar |
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
incidence.fit (double) - an estimate of the incidence rate on a log scale
incidence.se.fit (positive_double) - the standard error of the incidence rate estimate on a log scale
incidence.0.025 (positive_double) - lower confidence limit of the incidence rate (true scale)
incidence.0.5 (positive_double) - median estimate of the incidence rate (true scale)
incidence.0.975 (positive_double) - upper confidence limit of the incidence rate (true scale)
Any grouping allowed.
data = example_poisson_growth_rate() tmp2 = data %>% poisson_glm_model(window=7,deg=2) tmp3 = data %>% poisson_glm_model(window=14,deg=1) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(class="7:2"), tmp3 %>% dplyr::mutate(class="14:1"), ) %>% dplyr::group_by(class) if (interactive()) { plot_incidence(comp, date_labels="%b %y", raw=data, true_col=rate) }data = example_poisson_growth_rate() tmp2 = data %>% poisson_glm_model(window=7,deg=2) tmp3 = data %>% poisson_glm_model(window=14,deg=1) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(class="7:2"), tmp3 %>% dplyr::mutate(class="14:1"), ) %>% dplyr::group_by(class) if (interactive()) { plot_incidence(comp, date_labels="%b %y", raw=data, true_col=rate) }
Takes a list of times and counts and fits a quasi-poisson model
fitted with a log link function to count data using local regression
using the package locfit.
poisson_locfit_model( d = i_incidence_input, ..., window = 14, deg = 2, frequency = "1 day", predict = TRUE, ip = i_discrete_ip, quick = FALSE, .progress = interactive() )poisson_locfit_model( d = i_incidence_input, ..., window = 14, deg = 2, frequency = "1 day", predict = TRUE, ip = i_discrete_ip, quick = FALSE, .progress = interactive() )
d |
input data - a dataframe with columns:
Any grouping allowed. |
... |
not used and present to allow proportion model to be used in a
|
window |
a number of data points defining the bandwidth of the estimate,
smaller values result in less smoothing, large value in more. The default
value of 14 is calibrated for data provided on a daily frequency, with
weekly data a lower value may be preferred. - default |
deg |
polynomial degree (min 1) - higher degree results in less
smoothing, lower values result in more smoothing. A degree of 1 is fitting
a linear model piece wise. - default |
frequency |
the density of the output estimates as a time period such as
|
predict |
result is a prediction dataframe. If false we return the
|
ip |
An infectivity profile (optional) if not given (the default) the Rt value will not be estimated - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
quick |
if |
.progress |
show a CLI progress bar |
This results is an incidence rate estimate plus an absolute exponential growth rate estimate both based on the time unit of the input data (e.g. for daily data the rate will be cases per day and the growth rate will be daily).
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
incidence.fit (double) - an estimate of the incidence rate on a log scale
incidence.se.fit (positive_double) - the standard error of the incidence rate estimate on a log scale
incidence.0.025 (positive_double) - lower confidence limit of the incidence rate (true scale)
incidence.0.5 (positive_double) - median estimate of the incidence rate (true scale)
incidence.0.975 (positive_double) - upper confidence limit of the incidence rate (true scale)
growth.fit (double) - an estimate of the growth rate
growth.se.fit (positive_double) - the standard error the growth rate
growth.0.025 (double) - lower confidence limit of the growth rate
growth.0.5 (double) - median estimate of the growth rate
growth.0.975 (double) - upper confidence limit of the growth rate
Any grouping allowed.
data = example_poisson_rt() tmp = data %>% poisson_locfit_model(window=14,deg=2, ip=example_ip(), quick=TRUE) plot_rt(tmp, raw = data, true_col = rt) data = example_poisson_growth_rate() tmp2 = data %>% poisson_locfit_model(window=7,deg=1) tmp3 = data %>% poisson_locfit_model(window=14,deg=2) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(class="7:1"), tmp3 %>% dplyr::mutate(class="14:2"), ) %>% dplyr::group_by(class) if (interactive()) { plot_incidence( comp, date_labels="%b %y", raw=data, true_col = rate ) plot_growth_rate( comp, date_labels="%b %y", raw = data, true_col = growth ) # sim_geom_function(data,colour="black") }data = example_poisson_rt() tmp = data %>% poisson_locfit_model(window=14,deg=2, ip=example_ip(), quick=TRUE) plot_rt(tmp, raw = data, true_col = rt) data = example_poisson_growth_rate() tmp2 = data %>% poisson_locfit_model(window=7,deg=1) tmp3 = data %>% poisson_locfit_model(window=14,deg=2) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(class="7:1"), tmp3 %>% dplyr::mutate(class="14:2"), ) %>% dplyr::group_by(class) if (interactive()) { plot_incidence( comp, date_labels="%b %y", raw=data, true_col = rate ) plot_growth_rate( comp, date_labels="%b %y", raw = data, true_col = growth ) # sim_geom_function(data,colour="black") }
This uses a generalised linear model to fit a quasi-binomial model with a time
varying rate as a natural cubic spline with approx one degree of freedom per
window units of the time series.
proportion_glm_model( d = i_proportion_input, ..., window = 14, frequency = "1 day", .progress = interactive() )proportion_glm_model( d = i_proportion_input, ..., window = 14, frequency = "1 day", .progress = interactive() )
d |
Proportion model input - a dataframe with columns:
Any grouping allowed. |
... |
not used and present to allow proportion model to be used in a
|
window |
a number of data points defining the bandwidth of the estimate,
smaller values result in less smoothing, large value in more. The default
value of 14 is calibrated for data provided on a daily frequency, with
weekly data a lower value may be preferred. - default |
frequency |
the density of the output estimates as a time period such as
|
.progress |
show a CLI progress bar |
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
proportion.fit (double) - an estimate of the proportion on a logit scale
proportion.se.fit (positive_double) - the standard error of proportion estimate on a logit scale
proportion.0.025 (proportion) - lower confidence limit of proportion (true scale)
proportion.0.5 (proportion) - median estimate of proportion (true scale)
proportion.0.975 (proportion) - upper confidence limit of proportion (true scale)
Any grouping allowed.
data = example_poisson_rt_2class() tmp2 = data %>% proportion_glm_model(window=7,deg=2) tmp3 = data %>% proportion_glm_model(window=14,deg=1) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(model="7:2"), tmp3 %>% dplyr::mutate(model="14:1"), ) %>% dplyr::group_by(model,class) if (interactive()) { plot_proportion( comp, date_labels="%b %y", mapping=ggplot2::aes(colour=model), raw=data )+ ggplot2::facet_wrap(~class) } # TODO: deal with error conditions # "observations with zero weight not used for calculating dispersiondata = example_poisson_rt_2class() tmp2 = data %>% proportion_glm_model(window=7,deg=2) tmp3 = data %>% proportion_glm_model(window=14,deg=1) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(model="7:2"), tmp3 %>% dplyr::mutate(model="14:1"), ) %>% dplyr::group_by(model,class) if (interactive()) { plot_proportion( comp, date_labels="%b %y", mapping=ggplot2::aes(colour=model), raw=data )+ ggplot2::facet_wrap(~class) } # TODO: deal with error conditions # "observations with zero weight not used for calculating dispersion
takes a list of times, counts and a denominator and fits a quasi-binomial model
using a logit link function to proportion data using local regression
using the package locfit.
proportion_locfit_model( d = i_proportion_input, ..., window = 14, deg = 1, frequency = "1 day", predict = TRUE, .progress = interactive() )proportion_locfit_model( d = i_proportion_input, ..., window = 14, deg = 1, frequency = "1 day", predict = TRUE, .progress = interactive() )
d |
the input - a dataframe with columns:
Any grouping allowed. |
... |
not used and present to allow proportion model to be used in a
|
window |
a number of data points defining the bandwidth of the estimate,
smaller values result in less smoothing, large value in more. The default
value of 14 is calibrated for data provided on a daily frequency, with
weekly data a lower value may be preferred. - default |
deg |
polynomial degree (min 1) - higher degree results in less
smoothing, lower values result in more smoothing. A degree of 1 is fitting
a linear model piece wise. - default |
frequency |
the density of the output estimates as a time period such as
|
predict |
result is a prediction dataframe. If false we return the
|
.progress |
show a CLI progress bar |
This expects d to contain one combination of:
time and count and denom columns - e.g. all tests conducted.
This results is a one versus others comparison binomial proportion estimate plus a relative growth rate estimate specifying how much quicker this is growing compared to the growth of the denominator.
The denominator maybe the sum of all subgroups denom = sum(count), e.g. in
the situation where there are multiple variants of a disease circulating. In
which case the relative growth is that of the subgroup compared to the
overall. You can make this a one-versus-others comparison by making the
denominator exclude the current item (e.g. denom = sum(count)-count).
The denominator can also be used to express the size of the population tested. This gives us a relative growth rate that is different in essence to the previous and may be a better estimate of the true growth rate in the situation where testing effort is variable, or capacity saturated.
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
proportion.fit (double) - an estimate of the proportion on a logit scale
proportion.se.fit (positive_double) - the standard error of proportion estimate on a logit scale
proportion.0.025 (proportion) - lower confidence limit of proportion (true scale)
proportion.0.5 (proportion) - median estimate of proportion (true scale)
proportion.0.975 (proportion) - upper confidence limit of proportion (true scale)
relative.growth.fit (double) - an estimate of the relative growth rate
relative.growth.se.fit (positive_double) - the standard error the relative growth rate
relative.growth.0.025 (double) - lower confidence limit of the relative growth rate
relative.growth.0.5 (double) - median estimate of the relative growth rate
relative.growth.0.975 (double) - upper confidence limit of the relative growth rate
Any grouping allowed.
data = example_poisson_rt_2class() tmp2 = data %>% proportion_locfit_model(window=7,deg=2) tmp3 = data %>% proportion_locfit_model(window=14,deg=1) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(model="7:2"), tmp3 %>% dplyr::mutate(model="14:1"), ) %>% dplyr::group_by(model,class) if (interactive()) { plot_proportion( comp, date_labels="%b %y", mapping=ggplot2::aes(colour=model), raw=data )+ggplot2::facet_wrap(~class) }data = example_poisson_rt_2class() tmp2 = data %>% proportion_locfit_model(window=7,deg=2) tmp3 = data %>% proportion_locfit_model(window=14,deg=1) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(model="7:2"), tmp3 %>% dplyr::mutate(model="14:1"), ) %>% dplyr::group_by(model,class) if (interactive()) { plot_proportion( comp, date_labels="%b %y", mapping=ggplot2::aes(colour=model), raw=data )+ggplot2::facet_wrap(~class) }
The wedge distribution has a support of 0 to 1 and has a linear probability density function over that support.
pwedge(q, a, log.p = FALSE)pwedge(q, a, log.p = FALSE)
q |
vector of quantiles |
a |
a gradient from -2 (left skewed) to 2 (right skewed) |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
The rwedge can be combined with quantile functions to
skew standard distributions, or introduce correlation or down weight
certain parts of the distribution.
a vector of probabilities, quantiles, densities or samples.
pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )
Density, distribution function, quantile function and random
generation for the Beta distribution with parameters shape1 and
shape2 (and optional non-centrality parameter ncp).
qbeta2(p, prob, kappa, lower.tail = TRUE, log.p = FALSE)qbeta2(p, prob, kappa, lower.tail = TRUE, log.p = FALSE)
p |
vector of probabilities |
prob |
the mean probability (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
dbeta gives the density, pbeta the distribution
function, qbeta the quantile function, and rbeta
generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rbeta, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
qbeta2(c(0.25,0.5,0.75), 0.5, 0.25)qbeta2(c(0.25,0.5,0.75), 0.5, 0.25)
The following conversion describes the parameters mean and kappa
qcgamma(p, mean, kappa = 1/mean, lower.tail = TRUE, log.p = FALSE)qcgamma(p, mean, kappa = 1/mean, lower.tail = TRUE, log.p = FALSE)
p |
vector of probabilities |
mean |
the mean value on the true scale (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
qcgamma(c(0.25,0.5,0.75), 2, 0.5)qcgamma(c(0.25,0.5,0.75), 2, 0.5)
Density, distribution function, quantile function and random
generation for the Gamma distribution with parameters shape and
scale.
qgamma2(p, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)qgamma2(p, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)
p |
vector of probabilities |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
qgamma2(c(0.25,0.5,0.75), 2, 1)qgamma2(c(0.25,0.5,0.75), 2, 1)
Density, distribution function, quantile function and random
generation for the log normal distribution whose logarithm has mean
equal to meanlog and standard deviation equal to sdlog.
qlnorm2(p, mean = 1, sd = sqrt(exp(1) - 1), lower.tail = TRUE, log.p = FALSE)qlnorm2(p, mean = 1, sd = sqrt(exp(1) - 1), lower.tail = TRUE, log.p = FALSE)
p |
vector of probabilities. |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are
|
log.p |
logical; if TRUE, probabilities p are given as log(p). |
The log normal distribution has density
where and are the mean and standard
deviation of the logarithm.
The mean is ,
the median is , and the variance
and hence the coefficient of variation is
which is
approximately when that is small (e.g., ).
dlnorm gives the density,
plnorm gives the distribution function,
qlnorm gives the quantile function, and
rlnorm generates random deviates.
The length of the result is determined by n for
rlnorm, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
The cumulative hazard
is -plnorm(t, r, lower = FALSE, log = TRUE).
dlnorm is calculated from the definition (in ‘Details’).
[pqr]lnorm are based on the relationship to the normal.
Consequently, they model a single point mass at exp(meanlog)
for the boundary case sdlog = 0.
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. Wadsworth & Brooks/Cole.
Johnson, N. L., Kotz, S. and Balakrishnan, N. (1995) Continuous Univariate Distributions, volume 1, chapter 14. Wiley, New York.
Distributions for other standard distributions, including
dnorm for the normal distribution.
qlnorm2(c(0.25,0.5,0.72), 2, 1)qlnorm2(c(0.25,0.5,0.72), 2, 1)
The logit-normal distribution has a support of 0 to 1.
qlogitnorm(p, meanlogit = 0, sdlogit = 1, lower.tail = TRUE, log.p = FALSE)qlogitnorm(p, meanlogit = 0, sdlogit = 1, lower.tail = TRUE, log.p = FALSE)
p |
vector of probabilities |
meanlogit |
the mean on the logit scale |
sdlogit |
the sd on the logit scale |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
a vector of probabilities, quantiles, densities or samples.
qlogitnorm(c(0.25,0.5,0.75), 0, 1)qlogitnorm(c(0.25,0.5,0.75), 0, 1)
The logit-normal distribution has a support of 0 to 1.
qlogitnorm2( p, prob.0.5 = 0.5, kappa = 1 - exp(-1), lower.tail = TRUE, log.p = FALSE )qlogitnorm2( p, prob.0.5 = 0.5, kappa = 1 - exp(-1), lower.tail = TRUE, log.p = FALSE )
p |
vector of probabilities |
prob.0.5 |
the median on the true scale |
kappa |
a dispersion parameter from 0 (none) to 1 maximum dispersion |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
a vector of probabilities, quantiles, densities or samples.
qlnorm2(c(0.25,0.5,0.72), 2, 1)qlnorm2(c(0.25,0.5,0.72), 2, 1)
Density, distribution function, quantile function and random
generation for the negative binomial distribution with parameters
size and prob.
qnbinom2(p, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)qnbinom2(p, mean, sd = sqrt(mean), lower.tail = TRUE, log.p = FALSE)
p |
vector of probabilities. |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
lower.tail |
logical; if TRUE (default), probabilities are
|
log.p |
logical; if TRUE, probabilities p are given as log(p). |
The negative binomial distribution with size and
prob has density
for , and .
This represents the number of failures which occur in a sequence of
Bernoulli trials before a target number of successes is reached.
The mean is and variance .
A negative binomial distribution can also arise as a mixture of
Poisson distributions with mean distributed as a gamma distribution
(see pgamma) with scale parameter (1 - prob)/prob
and shape parameter size. (This definition allows non-integer
values of size.)
An alternative parametrization (often used in ecology) is by the
mean mu (see above), and size, the dispersion
parameter, where prob = size/(size+mu). The variance
is mu + mu^2/size in this parametrization.
If an element of x is not integer, the result of dnbinom
is zero, with a warning.
The case size == 0 is the distribution concentrated at zero.
This is the limiting distribution for size approaching zero,
even if mu rather than prob is held constant. Notice
though, that the mean of the limit distribution is 0, whatever the
value of mu.
The quantile is defined as the smallest value such that
, where is the distribution function.
dnbinom gives the density,
pnbinom gives the distribution function,
qnbinom gives the quantile function, and
rnbinom generates random deviates.
Invalid size or prob will result in return value
NaN, with a warning.
The length of the result is determined by n for
rnbinom, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
rnbinom returns a vector of type integer unless generated
values exceed the maximum representable integer when double
values are returned.
dnbinom computes via binomial probabilities, using code
contributed by Catherine Loader (see dbinom).
pnbinom uses pbeta.
qnbinom uses the Cornish–Fisher Expansion to include a skewness
correction to a normal approximation, followed by a search.
rnbinom uses the derivation as a gamma mixture of Poisson
distributions, see
Devroye, L. (1986) Non-Uniform Random Variate Generation. Springer-Verlag, New York. Page 480.
Distributions for standard distributions, including
dbinom for the binomial, dpois for the
Poisson and dgeom for the geometric distribution, which
is a special case of the negative binomial.
qnbinom2(c(0.25,0.5,0.75), 5, sqrt(5))qnbinom2(c(0.25,0.5,0.75), 5, sqrt(5))
Null distributions always returns NA
qnull(p, ...)qnull(p, ...)
p |
vector of probabilities |
... |
not used |
qnull(c(0.25,0.5,0.75), 0.5, 0.25)qnull(c(0.25,0.5,0.75), 0.5, 0.25)
A specific parameter or set of parameters can be estimated by a pipeline.
This function applies the pipeline to a synthetic epidemic with sawtooth
incidence resulting from a stepped growth rate function. The lag between
synthetic input and estimate is assessed by minimising the root mean square
error of input and estimated based on different lag offsets.
quantify_lag(pipeline, ip = i_empirical_ip, lags = -10:30)quantify_lag(pipeline, ip = i_empirical_ip, lags = -10:30)
pipeline |
a function taking an input dataset and an infectivity profile
as inputs and producing an estimate as output. This is the whole
parametrised pipeline including any other inputs. This can be a |
ip |
the infectivity profile. - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
lags |
a vector with the delays to test. Defaults to -10 to +30 days |
a lag analysis dataframe containing the estimate type and the lag
in days that the estimate is behind the actual observation
set_default_start("2025-01-01") # lags from a locfit incidence model with Rt estimation. # This model has no estimator lag: pipeline = ~ .x %>% poisson_locfit_model() %>% rt_from_incidence(ip = .y) quantify_lag(pipeline, ip = example_ip()) # lags from an epiestim Rt estimation # this model's lags depend on the infectivity profile. # In this case it is 8 days pipeline2 = ~ .x %>% rt_epiestim(ip = .y) quantify_lag(pipeline2, ip=example_ip() )set_default_start("2025-01-01") # lags from a locfit incidence model with Rt estimation. # This model has no estimator lag: pipeline = ~ .x %>% poisson_locfit_model() %>% rt_from_incidence(ip = .y) quantify_lag(pipeline, ip = example_ip()) # lags from an epiestim Rt estimation # this model's lags depend on the infectivity profile. # In this case it is 8 days pipeline2 = ~ .x %>% rt_epiestim(ip = .y) quantify_lag(pipeline2, ip=example_ip() )
Extract the weekday, month or quarter, or the Julian time (days since some origin). These are generic functions: the methods for the internal date-time classes are documented here.
## S3 method for class 'time_period' quarters(x, ...)## S3 method for class 'time_period' quarters(x, ...)
x |
an object inheriting from class |
... |
arguments for other methods. |
weekdays and months return a character
vector of names in the locale in use, i.e., Sys.getlocale("LC_TIME").
quarters returns a character vector of "Q1" to
"Q4".
julian returns the number of days (possibly fractional)
since the origin, with the origin as a "origin" attribute.
All time calculations in R are done ignoring leap-seconds.
Other components such as the day of the month or the year are
very easy to compute: just use as.POSIXlt and extract
the relevant component. Alternatively (especially if the components
are desired as character strings), use strftime.
DateTimeClasses, Date;
Sys.getlocale("LC_TIME") crucially for months() and weekdays().
## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time
The wedge distribution has a support of 0 to 1 and has a linear probability density function over that support.
qwedge(p, a, lower.tail = TRUE, log.p = FALSE)qwedge(p, a, lower.tail = TRUE, log.p = FALSE)
p |
vector of probabilities |
a |
a gradient from -2 (left skewed) to 2 (right skewed) |
lower.tail |
logical; if TRUE (default), probabilities are |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
The rwedge can be combined with quantile functions to
skew standard distributions, or introduce correlation or down weight
certain parts of the distribution.
a vector of probabilities, quantiles, densities or samples.
pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )
A random Bernoulli sample as a logical value
rbern(n, prob)rbern(n, prob)
n |
number of observations |
prob |
the mean probability (vectorised) |
a vector of logical values of size n
table(rbern(100, 0.25))table(rbern(100, 0.25))
Density, distribution function, quantile function and random
generation for the Beta distribution with parameters shape1 and
shape2 (and optional non-centrality parameter ncp).
rbeta2(n, prob, kappa)rbeta2(n, prob, kappa)
n |
number of observations |
prob |
the mean probability (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
dbeta gives the density, pbeta the distribution
function, qbeta the quantile function, and rbeta
generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rbeta, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
rbeta2(3, c(0.1,0.5,0.9),0.1)rbeta2(3, c(0.1,0.5,0.9),0.1)
Sampling from the multinomial equivalent of the Bernoulli distribution
rcategorical(n, prob, factor = FALSE)rcategorical(n, prob, factor = FALSE)
n |
a sample size |
prob |
a (optionally named) vector of probabilities that will be normalised to sum to 1 |
factor |
if not FALSE then factor levels will either be taken from the
names of |
a vector of random class labels of length n. Labels come from names
of prob or from a character vector in factor.
prob = c("one"=0.1,"two"=0.2,"seven"=0.7) table(rcategorical(1000,prob)) rcategorical(10,prob,factor=TRUE) rcategorical(10,rep(1,26),factor=letters)prob = c("one"=0.1,"two"=0.2,"seven"=0.7) table(rcategorical(1000,prob)) rcategorical(10,prob,factor=TRUE) rcategorical(10,rep(1,26),factor=letters)
The following conversion describes the parameters mean and kappa
rcgamma(n, mean, kappa = 1/mean)rcgamma(n, mean, kappa = 1/mean)
n |
number of observations |
mean |
the mean value on the true scale (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
rcgamma(10, 2, 0.5)rcgamma(10, 2, 0.5)
For count data that is under-dispersed we can use a gamma distribution rounded
to the nearest whole number. This is the same as the method of discretisation
in make_gamma_ip, and suits delay distributions where there is less variability
than can be represented with a Poisson or negative binomial distribution.
rdiscgamma(n, mean, sd, kappa)rdiscgamma(n, mean, sd, kappa)
n |
number of observations |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
kappa |
a coefficient of variation. where 0 is no variability and 1 is maximally variability (vectorised) |
an integer valued vector from a gamma distribution.
rdiscgamma(10, 2, 1)rdiscgamma(10, 2, 1)
e.g. age banded population, or a discrete probability distribution e.g. a serial interval distribution. This method fits a monotonically increasing spline to the cumulative distribution (including the upper and lower limits) and interpolating using that spline to the new cut points.
reband_discrete( x, y, xout, xlim = c(0, NA), ytotal = c(0, sum(y)), digits = 0, labelling = c("positive_integer", "inclusive", "exclusive"), sep = "-" )reband_discrete( x, y, xout, xlim = c(0, NA), ytotal = c(0, sum(y)), digits = 0, labelling = c("positive_integer", "inclusive", "exclusive"), sep = "-" )
x |
a set of upper limits of bands, e.g. for age: 0-14;15-64;65-79;80+ is 15,65,80,NA |
y |
a set of quantities for each band e.g. population figures |
xout |
a set of new upper limits |
xlim |
Upper and lower limits for x. if the last band is e.g 80+ in the input and we want to know the 85+ band in the output some kind of maximum upper limit is needed to interpolate to. |
ytotal |
upper and lower limits for y. If the interpolation values fall outside of
x then the minimum and maximum limits of y are given by this. This would be |
digits |
if the |
labelling |
are the |
sep |
separator for names e.g. |
a re-banded set of discrete values, guaranteed to sum to the same as y
england_demographics = ukc19::uk_population_2019_by_10yr_age %>% dplyr::filter(name=="England") ul = stringr::str_extract(england_demographics$class, "_([0-9]+)",group = 1) %>% as.numeric() # ul is currently inclusive so add 1: ul = ul + 1 tmp = reband_discrete( ul, england_demographics$population, c(5,10,15,40,80), xlim=c(0,120)) tmp sum(tmp) sum(england_demographics$population)england_demographics = ukc19::uk_population_2019_by_10yr_age %>% dplyr::filter(name=="England") ul = stringr::str_extract(england_demographics$class, "_([0-9]+)",group = 1) %>% as.numeric() # ul is currently inclusive so add 1: ul = ul + 1 tmp = reband_discrete( ul, england_demographics$population, c(5,10,15,40,80), xlim=c(0,120)) tmp sum(tmp) sum(england_demographics$population)
Sometimes we may have, for example, modelled incidence or growth rates on weekly data resulting in cases per week and growth rate per week. We may wish to use this to estimate the reproduction number, using algorithms that assume a daily incidence. Not everything has a dependence on time, and things such as proportions, or prevalence will not change.
rescale_model(df = i_timeseries, time_unit)rescale_model(df = i_timeseries, time_unit)
df |
A data frame containing modelled output. This will modify the following columns if present: A dataframe containing the following columns:
Any grouping allowed. A dataframe containing the following columns:
Any grouping allowed. A dataframe containing the following columns:
Any grouping allowed. |
time_unit |
a |
the same time series with different time unit, and adjusted incidence and growth rate figures.
sim = sim_poisson_model(time_unit = "1 week") incidence = sim %>% poisson_locfit_model(frequency = "1 day", deg = 2, window=5) incidence2 = incidence %>% rescale_model(time_unit = "1 day") incidence2 %>% dplyr::glimpse()sim = sim_poisson_model(time_unit = "1 week") incidence = sim %>% poisson_locfit_model(frequency = "1 day", deg = 2, window=5) incidence2 = incidence %>% rescale_model(time_unit = "1 day") incidence2 %>% dplyr::glimpse()
Randomly sample incident times in an exponentially growing process
rexpgrowth(n, r, t_end, t_start = 0)rexpgrowth(n, r, t_end, t_start = 0)
n |
the number of items to sample |
r |
and exponential growth rate (per unit time) |
t_end |
the end of the observation period |
t_start |
the start of the observation period |
a vector of n samples from an exponential growth process
graphics::hist(rexpgrowth(1000,0.1,40), breaks=40) graphics::hist(rexpgrowth(1000,-0.1,40), breaks=40)graphics::hist(rexpgrowth(1000,0.1,40), breaks=40) graphics::hist(rexpgrowth(1000,-0.1,40), breaks=40)
Randomly sample incident times in an exponentially growing process with initial case load
rexpgrowthI0(I0, r, t_end, t_start = 0)rexpgrowthI0(I0, r, t_end, t_start = 0)
I0 |
the expected number of cases observed in the first day |
r |
and exponential growth rate (per unit time) |
t_end |
the end of the observation period |
t_start |
the start of the observation period |
a vector of n samples from an exponential growth process
graphics::hist(rexpgrowthI0(10,0.1,20), breaks=40) graphics::hist(rexpgrowthI0(1000,-0.1,40), breaks=40)graphics::hist(rexpgrowthI0(10,0.1,20), breaks=40) graphics::hist(rexpgrowthI0(1000,-0.1,40), breaks=40)
Density, distribution function, quantile function and random
generation for the Gamma distribution with parameters shape and
scale.
rgamma2(n, mean, sd = sqrt(mean))rgamma2(n, mean, sd = sqrt(mean))
n |
number of observations |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
dgamma gives the density,
pgamma gives the distribution function,
qgamma gives the quantile function, and
rgamma generates random deviates.
Invalid arguments will result in return value NaN, with a warning.
The length of the result is determined by n for
rgamma, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
rgamma2(10, 2, 1)rgamma2(10, 2, 1)
Density, distribution function, quantile function and random
generation for the log normal distribution whose logarithm has mean
equal to meanlog and standard deviation equal to sdlog.
rlnorm2(n, mean = 1, sd = sqrt(exp(1) - 1))rlnorm2(n, mean = 1, sd = sqrt(exp(1) - 1))
n |
number of observations. If |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
The log normal distribution has density
where and are the mean and standard
deviation of the logarithm.
The mean is ,
the median is , and the variance
and hence the coefficient of variation is
which is
approximately when that is small (e.g., ).
dlnorm gives the density,
plnorm gives the distribution function,
qlnorm gives the quantile function, and
rlnorm generates random deviates.
The length of the result is determined by n for
rlnorm, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
The cumulative hazard
is -plnorm(t, r, lower = FALSE, log = TRUE).
dlnorm is calculated from the definition (in ‘Details’).
[pqr]lnorm are based on the relationship to the normal.
Consequently, they model a single point mass at exp(meanlog)
for the boundary case sdlog = 0.
Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988) The New S Language. Wadsworth & Brooks/Cole.
Johnson, N. L., Kotz, S. and Balakrishnan, N. (1995) Continuous Univariate Distributions, volume 1, chapter 14. Wiley, New York.
Distributions for other standard distributions, including
dnorm for the normal distribution.
rlnorm2(10, 2, 1)rlnorm2(10, 2, 1)
The logit-normal distribution has a support of 0 to 1.
rlogitnorm(n, meanlogit = 0, sdlogit = 1)rlogitnorm(n, meanlogit = 0, sdlogit = 1)
n |
number of observations |
meanlogit |
the mean on the logit scale |
sdlogit |
the sd on the logit scale |
a vector of probabilities, quantiles, densities or samples.
rlogitnorm(10, 0, 1)rlogitnorm(10, 0, 1)
The logit-normal distribution has a support of 0 to 1.
rlogitnorm2(n, prob.0.5 = 0.5, kappa = 1 - exp(-1))rlogitnorm2(n, prob.0.5 = 0.5, kappa = 1 - exp(-1))
n |
number of observations |
prob.0.5 |
the median on the true scale |
kappa |
a dispersion parameter from 0 (none) to 1 maximum dispersion |
a vector of probabilities, quantiles, densities or samples.
mean(rlogitnorm2(10000,0.75,0.2))mean(rlogitnorm2(10000,0.75,0.2))
Density, distribution function, quantile function and random
generation for the negative binomial distribution with parameters
size and prob.
rnbinom2(n, mean, sd = sqrt(mean))rnbinom2(n, mean, sd = sqrt(mean))
n |
number of observations. If |
mean |
the mean value on the true scale (vectorised) |
sd |
the standard deviation on the true scale (vectorised) |
The negative binomial distribution with size and
prob has density
for , and .
This represents the number of failures which occur in a sequence of
Bernoulli trials before a target number of successes is reached.
The mean is and variance .
A negative binomial distribution can also arise as a mixture of
Poisson distributions with mean distributed as a gamma distribution
(see pgamma) with scale parameter (1 - prob)/prob
and shape parameter size. (This definition allows non-integer
values of size.)
An alternative parametrization (often used in ecology) is by the
mean mu (see above), and size, the dispersion
parameter, where prob = size/(size+mu). The variance
is mu + mu^2/size in this parametrization.
If an element of x is not integer, the result of dnbinom
is zero, with a warning.
The case size == 0 is the distribution concentrated at zero.
This is the limiting distribution for size approaching zero,
even if mu rather than prob is held constant. Notice
though, that the mean of the limit distribution is 0, whatever the
value of mu.
The quantile is defined as the smallest value such that
, where is the distribution function.
dnbinom gives the density,
pnbinom gives the distribution function,
qnbinom gives the quantile function, and
rnbinom generates random deviates.
Invalid size or prob will result in return value
NaN, with a warning.
The length of the result is determined by n for
rnbinom, and is the maximum of the lengths of the
numerical arguments for the other functions.
The numerical arguments other than n are recycled to the
length of the result. Only the first elements of the logical
arguments are used.
rnbinom returns a vector of type integer unless generated
values exceed the maximum representable integer when double
values are returned.
dnbinom computes via binomial probabilities, using code
contributed by Catherine Loader (see dbinom).
pnbinom uses pbeta.
qnbinom uses the Cornish–Fisher Expansion to include a skewness
correction to a normal approximation, followed by a search.
rnbinom uses the derivation as a gamma mixture of Poisson
distributions, see
Devroye, L. (1986) Non-Uniform Random Variate Generation. Springer-Verlag, New York. Page 480.
Distributions for standard distributions, including
dbinom for the binomial, dpois for the
Poisson and dgeom for the geometric distribution, which
is a special case of the negative binomial.
rnbinom2(10, 5, sqrt(5))rnbinom2(10, 5, sqrt(5))
Null distributions always returns NA
rnull(n, ...)rnull(n, ...)
n |
number of observations |
... |
not used |
rnull(3, c(0.1,0.5,0.9),0.1)rnull(3, c(0.1,0.5,0.9),0.1)
Calculate a reproduction number estimate from incidence data using a reimplementation
of the Cori method and an empirical generation time distribution. This uses
a mixture distribution to transmit uncertainty in generation time estimates.
A number of changes compared to the original EpiEstim implementation
have been made. Firstly there is no technical
limitation to the infectivity profile being strictly positive in time. This allows
use of serial intervals and secondary potentially delayed observations.
Secondly this implementation should tolerate missing count values (NA values
must be filtered out though). Thirdly for a given time point t this applies
all Rt estimates for which the window spans time point t rather than end
on time point t, which tends to address lag issues with the original,
and fourthly this implementation allows multiple window
widths to be calculated in parallel and aggregated. All of this tends to
increase uncertainty in the result particularly in the time dimension, which
addresses some of the issue seem with EpiEstim during the pandemic. Finally
it is quite a bit quicker, especially if approximate quantiles are all that
are needed.
rt_cori( df = i_incidence_input, ip = i_discrete_ip, window = 14, mean_prior = 1, std_prior = 2, ..., epiestim_compat = FALSE, approx = FALSE, .progress = interactive() )rt_cori( df = i_incidence_input, ip = i_discrete_ip, window = 14, mean_prior = 1, std_prior = 2, ..., epiestim_compat = FALSE, approx = FALSE, .progress = interactive() )
df |
The count data. Extra groups are allowed. - a dataframe with columns:
Any grouping allowed. |
ip |
A long format infectivity profile. - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
window |
|
mean_prior |
the prior for the $R_t$ estimate. When sample size is low the
$R_t$ estimate will revert to this prior. In |
std_prior |
the prior for the $R_t$ SD. |
... |
not used |
epiestim_compat |
produce an estimate of |
approx |
approximate the quantiles of the mixture distribution with a gamma distribution with the same first mean and SD. |
.progress |
show a CLI progress bar |
There are still issues with large $R_t$ estimates in the early part of a time series, which is a resul tof the renewal equaltion method.
This will calculate a reproduction number for each group in the input dataframe.
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rt.fit (double) - an estimate of the reproduction number
rt.se.fit (positive_double) - the standard error of the reproduction number
rt.0.025 (double) - lower confidence limit of the reproduction number
rt.0.5 (double) - median estimate of the reproduction number
rt.0.975 (double) - upper confidence limit of the reproduction number
Any grouping allowed.
data = example_poisson_rt_smooth() tmp2 = data %>% rt_cori(ip=example_ip(), epiestim_compat = TRUE) tmp3 = data %>% rt_cori(ip=example_ip(), window=c(5:14), approx=TRUE) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(class = "EpiEstim"), tmp3 %>% dplyr::mutate(class = "Cori+") ) %>% dplyr::group_by(class) if (interactive()) { plot_rt(comp, date_labels="%b %y")+sim_geom_function(data,colour="black")+ ggplot2::coord_cartesian(ylim=c(0.5,3.0)) }data = example_poisson_rt_smooth() tmp2 = data %>% rt_cori(ip=example_ip(), epiestim_compat = TRUE) tmp3 = data %>% rt_cori(ip=example_ip(), window=c(5:14), approx=TRUE) comp = dplyr::bind_rows( tmp2 %>% dplyr::mutate(class = "EpiEstim"), tmp3 %>% dplyr::mutate(class = "Cori+") ) %>% dplyr::group_by(class) if (interactive()) { plot_rt(comp, date_labels="%b %y")+sim_geom_function(data,colour="black")+ ggplot2::coord_cartesian(ylim=c(0.5,3.0)) }
EpiEstim reproduction number wrapper functionCalculate a reproduction number estimate from incidence data using the
EpiEstim library and an empirical generation time distribution. This uses
resampling to transmit uncertainty in generation time estimates. This is
quite slow for each time series depending on the number of bootstraps and
samples in the infectivity profile.
rt_epiestim( df = i_incidence_input, ip = i_discrete_ip, bootstraps = 2000, window = 14, mean_prior = 1, std_prior = 2, ..., .progress = interactive() )rt_epiestim( df = i_incidence_input, ip = i_discrete_ip, bootstraps = 2000, window = 14, mean_prior = 1, std_prior = 2, ..., .progress = interactive() )
df |
Count data. Extra groups are allowed. - a dataframe with columns:
Any grouping allowed. |
ip |
infectivity profile - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
bootstraps |
|
window |
|
mean_prior |
the prior for the $R_t$ estimate. When sample size is low the
$R_t$ estimate will revert to this prior. In |
std_prior |
the prior for the $R_t$ SD. |
... |
not used |
.progress |
show a CLI progress bar |
This will calculate a reproduction number for each group in the input dataframe.
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rt.fit (double) - an estimate of the reproduction number
rt.se.fit (positive_double) - the standard error of the reproduction number
rt.0.025 (double) - lower confidence limit of the reproduction number
rt.0.5 (double) - median estimate of the reproduction number
rt.0.975 (double) - upper confidence limit of the reproduction number
Any grouping allowed.
data = example_poisson_rt_smooth() tmp2 = data %>% rt_epiestim(ip=example_ip()) if (interactive()) { plot_rt(tmp2, date_labels="%b %y")+sim_geom_function(data,colour="red") }data = example_poisson_rt_smooth() tmp2 = data %>% rt_epiestim(ip=example_ip()) if (interactive()) { plot_rt(tmp2, date_labels="%b %y")+sim_geom_function(data,colour="red") }
Calculate a reproduction number estimate from growth rate using the Wallinga and Lipsitch 2007 estimation using empirical generation time distribution. This uses resampling to transmit uncertainty in growth rate estimates. This also handles time-series that are not on a daily cadence (although this is experimental). The reproduction number estimate is neither a instantaneous (backward looking) nor case (forward looking) reproduction number but somewhere between the two, as the method looks at a flux of infection at a single point in time.
rt_from_growth_rate( df = i_growth_rate, ip = i_empirical_ip, bootstraps = 1000, seed = Sys.time(), .progress = interactive() )rt_from_growth_rate( df = i_growth_rate, ip = i_empirical_ip, bootstraps = 1000, seed = Sys.time(), .progress = interactive() )
df |
Growth rate estimates - a dataframe with columns:
Any grouping allowed. |
ip |
Infectivity profile - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
bootstraps |
|
seed |
a random number generator seed |
.progress |
show a CLI progress bar |
This method is quite slow compared to some others and the default is non deterministic.
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rt.fit (double) - an estimate of the reproduction number
rt.se.fit (positive_double) - the standard error of the reproduction number
rt.0.025 (double) - lower confidence limit of the reproduction number
rt.0.5 (double) - median estimate of the reproduction number
rt.0.975 (double) - upper confidence limit of the reproduction number
Any grouping allowed.
data = example_poisson_rt_smooth() tmp = data %>% poisson_locfit_model() %>% rt_from_growth_rate(ip=example_ip()) if (interactive()) { plot_rt(tmp, date_labels="%b %y")+sim_geom_function(data,colour="red") }data = example_poisson_rt_smooth() tmp = data %>% poisson_locfit_model() %>% rt_from_growth_rate(ip=example_ip()) if (interactive()) { plot_rt(tmp, date_labels="%b %y")+sim_geom_function(data,colour="red") }
Calculate a reproduction number estimate from modelled incidence using the
methods described in the vignette "Estimating the reproduction number from
modelled incidence" and using a set of empirical generation time
distributions. This assumes that modelled incidence has the same time unit as
the ip distribution, and that this is daily, if this is not the case then
rescale_model() may be able to fix it.
rt_from_incidence( df = i_incidence_model, ip = i_discrete_ip, raw = i_incidence_data, approx = TRUE, .progress = interactive() )rt_from_incidence( df = i_incidence_model, ip = i_discrete_ip, raw = i_incidence_data, approx = TRUE, .progress = interactive() )
df |
modelled incidence estimate - a dataframe with columns:
Any grouping allowed. |
ip |
an infectivity profile (aka generation time distribution) - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
raw |
the raw data that the modelled incidence is based on. This is optional. If not given the algorithm will assume independence, which will be faster to estimate (much faster for long time-series), but with more uncertain Rt estimates. In some circumstances this assumption of independence can cause underestimation of Rt. If this is a risk there will be a warning given, and this parameter may need to be supplied. - a dataframe with columns:
Any grouping allowed. |
approx |
use a faster, but approximate, estimate of quantiles |
.progress |
show a CLI progress bar |
N.B. for certain estimators (e.g. poisson_gam_model(),
poisson_locfit_model()) a version of this function will be called
automatically if the infectivity profile is supplied. The inbuilt version is
preferred over this function for these estimators, as the full covariance
matrix may be used and the initial part of the outbreak can be predicted more
accurately. This function is for supporting the other count models in
ggoutbreak. If you are rolling your own incidence estimates and want an Rt
estimate then rt_incidence_timeseries_implementation() maybe better suited
to your task.
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rt.fit (double) - an estimate of the reproduction number
rt.se.fit (positive_double) - the standard error of the reproduction number
rt.0.025 (double) - lower confidence limit of the reproduction number
rt.0.5 (double) - median estimate of the reproduction number
rt.0.975 (double) - upper confidence limit of the reproduction number
Any grouping allowed.
tmp = example_poisson_rt_smooth() %>% poisson_locfit_model() %>% rt_from_incidence( ip = example_ip(), raw = example_poisson_rt_smooth(), approx=FALSE ) # This will assume independence and tmp2 = example_poisson_rt_smooth()%>% poisson_locfit_model() %>% rt_from_incidence(ip = example_ip(), approx=TRUE) plot_data = dplyr::bind_rows( tmp %>% dplyr::mutate(class = "exact"), tmp2 %>% dplyr::mutate(class = "approx"), ) %>% dplyr::group_by(class) if (interactive()) { plot_rt(plot_data, date_labels="%b %y")+ sim_geom_function(example_poisson_rt_smooth())+ ggplot2::coord_cartesian(ylim=c(0.5,3))+ ggplot2::facet_wrap(~class) }tmp = example_poisson_rt_smooth() %>% poisson_locfit_model() %>% rt_from_incidence( ip = example_ip(), raw = example_poisson_rt_smooth(), approx=FALSE ) # This will assume independence and tmp2 = example_poisson_rt_smooth()%>% poisson_locfit_model() %>% rt_from_incidence(ip = example_ip(), approx=TRUE) plot_data = dplyr::bind_rows( tmp %>% dplyr::mutate(class = "exact"), tmp2 %>% dplyr::mutate(class = "approx"), ) %>% dplyr::group_by(class) if (interactive()) { plot_rt(plot_data, date_labels="%b %y")+ sim_geom_function(example_poisson_rt_smooth())+ ggplot2::coord_cartesian(ylim=c(0.5,3))+ ggplot2::facet_wrap(~class) }
Calculate a reproduction number estimate from modelled incidence estimates,
by statistical sampling from a log-normally distributed incidence estimate,
such as you get from a I_t ~ Poisson(I_0 e^{rt}) model using a log link
function. This is combined with uncertain infectivity profile specified as
multiple discrete empirical distributions, to calculate the range of possible
values for $R_t$.
rt_from_renewal( df = i_incidence_model, ip = i_discrete_ip, bootstraps = 1000, seed = Sys.time(), .progress = interactive() )rt_from_renewal( df = i_incidence_model, ip = i_discrete_ip, bootstraps = 1000, seed = Sys.time(), .progress = interactive() )
df |
modelled incidence estimate - a dataframe with columns:
Any grouping allowed. |
ip |
infectivity profile - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
bootstraps |
the number of samples to take at each time point. This will be rounded up to a whole multiple of the infectivity profile distribution length. |
seed |
a random number seed for reproducibility |
.progress |
show a CLI progress bar |
This method is moderately slow and non deterministic by default.
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rt.fit (double) - an estimate of the reproduction number
rt.se.fit (positive_double) - the standard error of the reproduction number
rt.0.025 (double) - lower confidence limit of the reproduction number
rt.0.5 (double) - median estimate of the reproduction number
rt.0.975 (double) - upper confidence limit of the reproduction number
Any grouping allowed.
data = example_poisson_rt_smooth() tmp2 = data %>% poisson_locfit_model() %>% rt_from_renewal(ip=example_ip()) if (interactive()) { plot_rt(tmp2, date_labels="%b %y")+sim_geom_function(data,colour="red") }data = example_poisson_rt_smooth() tmp2 = data %>% poisson_locfit_model() %>% rt_from_renewal(ip=example_ip()) if (interactive()) { plot_rt(tmp2, date_labels="%b %y")+sim_geom_function(data,colour="red") }
This function estimates the reproduction number for a specific point in time
given a time series of log-normally distributed incidence estimates, a set of
infectivity profiles. This version of the algorithm only works for a single
time point and is not optimised for running over a whole time series. For that
please see rt_incidence_timeseries_implementation().
rt_incidence_reference_implementation( mu_t, vcov_ij = diag(sigma_t^2), omega, sigma_t = NULL, tau_offset = 0 )rt_incidence_reference_implementation( mu_t, vcov_ij = diag(sigma_t^2), omega, sigma_t = NULL, tau_offset = 0 )
mu_t |
a vector of meanlog parameters of a lognormal distribution of
modelled incidence. This is a vector of length |
vcov_ij |
a log scale variance-covariance matrix of predictions. It is
optional but if not given and |
omega |
a matrix (or vector) representing the infectivity profile as a
discrete time probability distribution. This must be a |
sigma_t |
if |
tau_offset |
In most cases the infectivity profile has support for
delays from |
N.B. the description of this algorithm is given here: https://ai4ci.github.io/ggoutbreak/articles/rt-from-incidence.html
a list with the following items:
time_Rt: the time point of the Rt esitmate (usually k-1)
mean_Rt_star: the mean of the Rt estimate
var_Rt_star: the variance of the Rt estimate
meanlog_Rt_star: log normal parameter for approximate distribution of Rt
sdlog_Rt_star: log normal parameter for approximate distribution of Rt
mu_Rt_mix: a vector of log normal parameters for more exact mixture distribution of Rt
sigma_Rt_mix: a vector of log normal parameters for more exact mixture distribution of Rt
quantile_Rt_fn: A log-normal mixture quantile function for Rt
Quantiles can either be obtained from the quantile function or from e.g.
qlnorm(p, meanlog_Rt_star, sdlog_Rt_star)
data = example_poisson_rt_smooth() omega = omega_matrix(example_ip()) k = nrow(omega) pred_time = 50 index = pred_time-k:1+1 # we will try and estimate the Rt at time k+10: print(data$rt[pred_time]) # first we need a set of incidence estimates # in the simplest example we use a GLM: newdata = dplyr::tibble(time = 1:100) model = stats::glm(count ~ splines::bs(time,df=8), family = "poisson", data=data) pred = stats::predict(model, newdata, se.fit = TRUE) # I've picked a df to make this example work. In real life you would need # to validate the incidence model is sensible and not over fitting # before using it to estimate RT: # ggplot2::ggplot()+ # ggplot2::geom_point(data=data, ggplot2::aes(x=time, y=count))+ # ggplot2::geom_line( # data = newdata %>% dplyr::mutate(fit = exp(pred$fit) # ), ggplot2::aes(x=time,y=fit)) mu_t = pred$fit[index] sigma_t = pred$se.fit[index] # prediction vcov is not simple from GLM models. rt_est = rt_incidence_reference_implementation(mu_t=mu_t,sigma_t = sigma_t, omega = omega) # quantiles from a mixture distribution: rt_est$quantile_Rt_fn(c(0.025,0.5,0.975)) # and from the rough estimate based on matching of moments: stats::qlnorm(c(0.025,0.5,0.975), rt_est$meanlog_Rt_star, rt_est$sdlog_Rt_star) # GLM do not produce vcov matrices we can estimate them if we have access to # the data: vcov_glm = vcov_from_residuals(data$count[1:100], pred$fit, pred$se.fit)$vcov_matrix vcov_glm_ij = vcov_glm[index, index] # Using an estimated vcov we get very similar answers: rt_est_vcov = rt_incidence_reference_implementation(mu_t=mu_t,vcov_ij = vcov_glm_ij, omega = omega) rt_est_vcov$quantile_Rt_fn(c(0.025,0.5,0.975)) # In theory assuming independence leads to excess uncertainty and possible # underestimation bias. In most situations though this appears low risk. # Lets do the same with a GAM: model2 = mgcv::gam(count ~ s(time), family = "poisson", data=data) pred2 = stats::predict(model2, newdata, se.fit = TRUE) # we can get prediction level vcov from GAMs easily Xp = stats::predict(model2, newdata, type = "lpmatrix") pred_vcov = Xp %*% stats::vcov(model2) %*% t(Xp) vcov_ij = pred_vcov[index, index] mu_t2 = pred2$fit[index] rt_est2 = rt_incidence_reference_implementation(mu_t=mu_t2, vcov_ij=vcov_ij, omega = omega) rt_est2$quantile_Rt_fn(c(0.025,0.5,0.975)) # How does this compare to EpiEstim: # N.B. setting seed to make deterministic withr::with_seed(100, { epi = EpiEstim::estimate_R( data$count, method = "si_from_sample", si_sample = omega, config = EpiEstim::make_config( method = "si_from_sample", t_start = pred_time-7, t_end = pred_time, n2 = 100) ) }) epi$R %>% dplyr::select(`Quantile.0.025(R)`, `Median(R)`, `Quantile.0.975(R)`)data = example_poisson_rt_smooth() omega = omega_matrix(example_ip()) k = nrow(omega) pred_time = 50 index = pred_time-k:1+1 # we will try and estimate the Rt at time k+10: print(data$rt[pred_time]) # first we need a set of incidence estimates # in the simplest example we use a GLM: newdata = dplyr::tibble(time = 1:100) model = stats::glm(count ~ splines::bs(time,df=8), family = "poisson", data=data) pred = stats::predict(model, newdata, se.fit = TRUE) # I've picked a df to make this example work. In real life you would need # to validate the incidence model is sensible and not over fitting # before using it to estimate RT: # ggplot2::ggplot()+ # ggplot2::geom_point(data=data, ggplot2::aes(x=time, y=count))+ # ggplot2::geom_line( # data = newdata %>% dplyr::mutate(fit = exp(pred$fit) # ), ggplot2::aes(x=time,y=fit)) mu_t = pred$fit[index] sigma_t = pred$se.fit[index] # prediction vcov is not simple from GLM models. rt_est = rt_incidence_reference_implementation(mu_t=mu_t,sigma_t = sigma_t, omega = omega) # quantiles from a mixture distribution: rt_est$quantile_Rt_fn(c(0.025,0.5,0.975)) # and from the rough estimate based on matching of moments: stats::qlnorm(c(0.025,0.5,0.975), rt_est$meanlog_Rt_star, rt_est$sdlog_Rt_star) # GLM do not produce vcov matrices we can estimate them if we have access to # the data: vcov_glm = vcov_from_residuals(data$count[1:100], pred$fit, pred$se.fit)$vcov_matrix vcov_glm_ij = vcov_glm[index, index] # Using an estimated vcov we get very similar answers: rt_est_vcov = rt_incidence_reference_implementation(mu_t=mu_t,vcov_ij = vcov_glm_ij, omega = omega) rt_est_vcov$quantile_Rt_fn(c(0.025,0.5,0.975)) # In theory assuming independence leads to excess uncertainty and possible # underestimation bias. In most situations though this appears low risk. # Lets do the same with a GAM: model2 = mgcv::gam(count ~ s(time), family = "poisson", data=data) pred2 = stats::predict(model2, newdata, se.fit = TRUE) # we can get prediction level vcov from GAMs easily Xp = stats::predict(model2, newdata, type = "lpmatrix") pred_vcov = Xp %*% stats::vcov(model2) %*% t(Xp) vcov_ij = pred_vcov[index, index] mu_t2 = pred2$fit[index] rt_est2 = rt_incidence_reference_implementation(mu_t=mu_t2, vcov_ij=vcov_ij, omega = omega) rt_est2$quantile_Rt_fn(c(0.025,0.5,0.975)) # How does this compare to EpiEstim: # N.B. setting seed to make deterministic withr::with_seed(100, { epi = EpiEstim::estimate_R( data$count, method = "si_from_sample", si_sample = omega, config = EpiEstim::make_config( method = "si_from_sample", t_start = pred_time-7, t_end = pred_time, n2 = 100) ) }) epi$R %>% dplyr::select(`Quantile.0.025(R)`, `Median(R)`, `Quantile.0.975(R)`)
This function estimates the reproduction number for a while time series given log-normally distributed incidence estimates, and a set of infectivity profiles. This version of the algorithm is optimised for running over all of a single time series. The algorithm will not produce Rt estimates until there is at least the
rt_incidence_timeseries_implementation( time, mu, vcov = NULL, sigma = NULL, ip = i_discrete_ip, tidy = FALSE, ... )rt_incidence_timeseries_implementation( time, mu, vcov = NULL, sigma = NULL, ip = i_discrete_ip, tidy = FALSE, ... )
time |
a set of time points as a numeric vector. This is a vector of length |
mu |
a time series of meanlog parameters of a lognormal distribution of
modelled incidence. This is a vector of length |
vcov |
a log scale variance-covariance matrix of predictions. It is
optional but if not given and |
sigma |
if |
ip |
a long format infectivity profile dataframe. - a dataframe with columns:
Minimally grouped by: boot (and other groupings allowed). |
tidy |
do you want detailed raw output ( |
... |
passed onto output formatter if |
N.B. the description of this algorithm is given here: https://ai4ci.github.io/ggoutbreak/articles/rt-from-incidence.html
a dataframe with k rows with the following columns:
time_Rt: the time point of the Rt estimate (usually k-1)
mean_Rt_star: the mean of the Rt estimate
var_Rt_star: the variance of the Rt estimate
meanlog_Rt_star: log normal parameter for approximate distribution of Rt
sdlog_Rt_star: log normal parameter for approximate distribution of Rt
mu_Rt_mix: a list of vectors of log normal parameters for more exact mixture distribution of Rt
sigma_Rt_mix: a list of vectors of log normal parameters for more exact mixture distribution of Rt
Approximate quantiles can be obtained from e.g. qlnorm(0.5, meanlog_Rt_star, sdlog_Rt_star)
Alternatively if tidy is true the output will be post processed to conform to:
A dataframe containing the following columns:
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
rt.fit (double) - an estimate of the reproduction number
rt.se.fit (positive_double) - the standard error of the reproduction number
rt.0.025 (double) - lower confidence limit of the reproduction number
rt.0.5 (double) - median estimate of the reproduction number
rt.0.975 (double) - upper confidence limit of the reproduction number
Any grouping allowed.
data = example_poisson_rt_smooth() ip = example_ip() # first we need a set of incidence estimates # we fit a poisson model to counts using a GAM: model = mgcv::gam(count ~ s(time), family = "poisson", data=data) ip_len = max(ip$tau) newdata = dplyr::tibble(time = -ip_len:100) pred = stats::predict(model, newdata, se.fit = TRUE) # we can get prediction vcov from GAMs fairly easily Xp = stats::predict(model, newdata, type = "lpmatrix") pred_vcov = Xp %*% stats::vcov(model) %*% t(Xp) # Now we estimate rt rt_est = rt_incidence_timeseries_implementation( time = newdata$time, mu = pred$fit, vcov = pred_vcov, ip = ip) # Lets compare epiestim on the same data withr::with_seed(100, { epi = EpiEstim::estimate_R( data$count, method = "si_from_sample", si_sample = omega_matrix(ip), config = EpiEstim::make_config( method = "si_from_sample", t_start = 10:100-7, t_end = 10:100, n2 = 100) ) }) ggplot2::ggplot()+ ggplot2::geom_line( data=data %>% dplyr::filter(time<100), ggplot2::aes(x=time,y=rt), colour="black")+ ggplot2::geom_line( data = rt_est, ggplot2::aes(x=time, y=mean_Rt_star, colour="gam+rt"))+ ggplot2::geom_line( data = epi$R, ggplot2::aes(x=t_end, y=`Mean(R)`, colour="epiestim")) # mean bias of GAM+rt estimate: mean(data$rt[seq_along(rt_est$mean_Rt_star)] - rt_est$mean_Rt_star)data = example_poisson_rt_smooth() ip = example_ip() # first we need a set of incidence estimates # we fit a poisson model to counts using a GAM: model = mgcv::gam(count ~ s(time), family = "poisson", data=data) ip_len = max(ip$tau) newdata = dplyr::tibble(time = -ip_len:100) pred = stats::predict(model, newdata, se.fit = TRUE) # we can get prediction vcov from GAMs fairly easily Xp = stats::predict(model, newdata, type = "lpmatrix") pred_vcov = Xp %*% stats::vcov(model) %*% t(Xp) # Now we estimate rt rt_est = rt_incidence_timeseries_implementation( time = newdata$time, mu = pred$fit, vcov = pred_vcov, ip = ip) # Lets compare epiestim on the same data withr::with_seed(100, { epi = EpiEstim::estimate_R( data$count, method = "si_from_sample", si_sample = omega_matrix(ip), config = EpiEstim::make_config( method = "si_from_sample", t_start = 10:100-7, t_end = 10:100, n2 = 100) ) }) ggplot2::ggplot()+ ggplot2::geom_line( data=data %>% dplyr::filter(time<100), ggplot2::aes(x=time,y=rt), colour="black")+ ggplot2::geom_line( data = rt_est, ggplot2::aes(x=time, y=mean_Rt_star, colour="gam+rt"))+ ggplot2::geom_line( data = epi$R, ggplot2::aes(x=t_end, y=`Mean(R)`, colour="epiestim")) # mean bias of GAM+rt estimate: mean(data$rt[seq_along(rt_est$mean_Rt_star)] - rt_est$mean_Rt_star)
The wedge distribution has a support of 0 to 1 and has a linear probability density function over that support.
rwedge(n, a)rwedge(n, a)
n |
number of observations |
a |
a gradient from -2 (left skewed) to 2 (right skewed) |
The rwedge can be combined with quantile functions to
skew standard distributions, or introduce correlation or down weight
certain parts of the distribution.
a vector of probabilities, quantiles, densities or samples.
pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )
A log1p x scale
scale_x_log1p(..., n = 5, base = 10, dp = 0)scale_x_log1p(..., n = 5, base = 10, dp = 0)
... |
Other arguments passed on to |
n |
the number of major breaks |
base |
the base for the logarithm |
dp |
decimal points |
a ggplot scale
A logit x scale
scale_x_logit(...)scale_x_logit(...)
... |
Other arguments passed on to |
a ggplot scale
A log1p y scale
scale_y_log1p(..., n = 5, base = 10, dp = 0)scale_y_log1p(..., n = 5, base = 10, dp = 0)
... |
Other arguments passed on to |
n |
the number of major breaks |
base |
the base for the logarithm |
dp |
decimal points |
a ggplot scale
A logit y scale
scale_y_logit(...)scale_y_logit(...)
... |
Other arguments passed on to |
a ggplot scale
This performs a range of continuous scoring metrics for each estimate time-point using cumulative distribution functions for each estimate. Point quality metrics are calculated for each estimate provided and summarised. Summarisation is performed using bootstrap resampling to generate confidence intervals for summary statistics, which are presented as median +/1 95% CI.
score_estimate( est, obs, lags = NULL, summarise_by = est %>% dplyr::groups(), bootstraps = 1000, raw_bootstraps = FALSE, seed = 100 )score_estimate( est, obs, lags = NULL, summarise_by = est %>% dplyr::groups(), bootstraps = 1000, raw_bootstraps = FALSE, seed = 100 )
est |
a dataframe of estimates of incidence, growth rate of reproduction
number based off a simulation or data with known parameters. Each group in
|
obs |
a dataframe of the ground truth, sharing the same grouping and
columns as |
lags |
a data frame of estimate types and lags as output by
|
summarise_by |
by default every group is treated separately. This can be
overridden with a |
bootstraps |
the number of bootstrap replicates to draw for assessing metric confidence. If FALSE then no bootstrapping will be done and the metrics returned will have no confidence intervals. |
raw_bootstraps |
(defaults to FALSE) return the summary metrics for each bootstrap rather than the quantiles of the summary metrics. |
seed |
a random seed for reproducibility |
a dataframe of scoring metrics, with one row per group. This includes the following columns:
mean_quantile_bias - the average of the universal residuals. Lower values
are better.
mean_trans_bias - the bias on the link function scale.
link - the link function
mean_bias - the bias on the natural scale (which may be interpreted as
additive or multiplicative depending on the link)
pit_was - an unadjusted probability integral transform histogram
Wasserstein distance from the uniform (lower values are better).
unbiased_pit_was - an PIT Wasserstein distance from the uniform, adjusted
for estimator bias (lower values are better). This is a measure of
calibration.
directed_pit_was - a PIT Wasserstein distance from the uniform, directed
away from the centre, adjusted for estimator bias (values closer to zero
are better, positive values indicate overconfidence, and negative values
excessively conservative estimates).
percent_iqr_coverage - the percentage of estimators that include the true
value in their IQR. For a perfectly calibrated estimate this should be 0.5.
Lower values reflect overconfidence, higher values reflect excessively
conservative estimates. This is a measure of calibration but is influenced
by bias.
unbiased_percent_iqr_coverage - the percentage of estimators that include
the true value in their IQR once adjusted for bias. This should be 0.5. This
is a measure of calibration, and tells you which direction (smaller numbers
are over-confident, larger values excessively conservative).
mean_prediction_interval_width_50 - the prediction interval width is a
measure of sharpness (smaller values are sharper). Sharper estimators are
superior if they are unbiased and well calibrated.
mean_crps - the mean value of the continuous rank probability score for
each point estimate (lower values are better)
mean_unbiased_crps - the mean value of the continuous rank probability
score for each point estimate assessed after adjustment for bias (lower
values are better)
threshold_misclassification_probability - if a metric has a natural threshold
like 1 for Rt then this measures how probable it is that the estimate will
propose the epidemic is shrinking when it is growing and vice versa. Lower is
better
other outputs are possible if summarise_by is false.
data = example_poisson_rt_smooth() pipeline = ~ .x %>% poisson_locfit_model(ip = .y, quick=TRUE) lags = quantify_lag(pipeline, ip = example_ip()) withr::with_options(list("ggoutbreak.keep_cdf"=TRUE),{ est = data %>% poisson_locfit_model(ip = example_ip(), quick=TRUE) }) if (interactive()) plot_rt(est)+sim_geom_function(data, colour="red") obs = data %>% dplyr::mutate(rt.obs = rt, incidence.obs = rate) score_estimate(est,obs,lags) %>% dplyr::glimpse()data = example_poisson_rt_smooth() pipeline = ~ .x %>% poisson_locfit_model(ip = .y, quick=TRUE) lags = quantify_lag(pipeline, ip = example_ip()) withr::with_options(list("ggoutbreak.keep_cdf"=TRUE),{ est = data %>% poisson_locfit_model(ip = example_ip(), quick=TRUE) }) if (interactive()) plot_rt(est)+sim_geom_function(data, colour="red") obs = data %>% dplyr::mutate(rt.obs = rt, incidence.obs = rate) score_estimate(est,obs,lags) %>% dplyr::glimpse()
This function is generally not needed, and will be called automatically
when the first date conversion is performed. If no other information is
given then the default origin will be decided by the start of the first use of the
time_period class in a session. This helps to keep defaults consistent
during a single run, if they are not specified.
set_defaults(start_date, unit) with_defaults(start_date, unit, expr) set_default_start(date) set_default_unit(unit)set_defaults(start_date, unit) with_defaults(start_date, unit, expr) set_default_start(date) set_default_unit(unit)
start_date |
the zero time date as something that can be coerced to a
date. If the |
unit |
the length of one unit of time. This will be either a integer
number of days, or a specification such as "1 week", or another |
expr |
an expression to evaluate with the defaults set to the provided values. |
date |
A date, or something that can be cast to one, that represents day zero for an outbreak. |
depending on the methods the original default start date / the
original default unit, a list of both or the result of evaluating the
expression expr
with_defaults(): Set defaults temporarily and execute expression
set_default_unit(): Set the default unit only
# set default origin and cadence: old = set_defaults("2025-01-01", "1 week") # this sets the default for interpreting underqualified time_periods: print(as.time_period(1:10)) # The default can always be overridden on a case by case basis: print(as.time_period(1:10, unit="1 day")) # or for a whole expression: with_defaults("2020-01-01", "1 day", { print(as.time_period(1:10)) }) # components can be changed individually, firstly origin: set_default_start("2025-01-01") print(as.time_period(1:10)) # now cadence: set_default_unit("1 day") print(as.time_period(1:10)) # clear the values: set_defaults(NULL,NULL) # A sufficiently qualified call will set the defaults: defined = as.time_period(as.Date("2024-01-01")+0:10*7, anchor="start") inherit = as.time_period(0:10) all(defined == inherit) # restoring the original values (which might be null) set_defaults(old)# set default origin and cadence: old = set_defaults("2025-01-01", "1 week") # this sets the default for interpreting underqualified time_periods: print(as.time_period(1:10)) # The default can always be overridden on a case by case basis: print(as.time_period(1:10, unit="1 day")) # or for a whole expression: with_defaults("2020-01-01", "1 day", { print(as.time_period(1:10)) }) # components can be changed individually, firstly origin: set_default_start("2025-01-01") print(as.time_period(1:10)) # now cadence: set_default_unit("1 day") print(as.time_period(1:10)) # clear the values: set_defaults(NULL,NULL) # A sufficiently qualified call will set the defaults: defined = as.time_period(as.Date("2024-01-01")+0:10*7, anchor="start") inherit = as.time_period(0:10) all(defined == inherit) # restoring the original values (which might be null) set_defaults(old)
Apply a ascertainment bias to the observed case counts.
sim_apply_ascertainment(df = i_sim_count_data, fn_asc = ~1, seed = Sys.time())sim_apply_ascertainment(df = i_sim_count_data, fn_asc = ~1, seed = Sys.time())
df |
a count dataframe from e.g.
Minimally grouped by: statistic (and other groupings allowed). |
fn_asc |
a function that takes a single input vector |
seed |
a RNG seed |
a dataframe with original column, and count column modified to
include ascertainment bias.
with_defaults("2025-01-01" ,"1 day", { dplyr::tibble( statistic = "incidence", time=as.time_period(1:10,"1 day"), count=rep(100,10) ) %>% dplyr::group_by(statistic) %>% sim_apply_ascertainment(~ ifelse(.x<=5,0.1,0.9)) })with_defaults("2025-01-01" ,"1 day", { dplyr::tibble( statistic = "incidence", time=as.time_period(1:10,"1 day"), count=rep(100,10) ) %>% dplyr::group_by(statistic) %>% sim_apply_ascertainment(~ ifelse(.x<=5,0.1,0.9)) })
Events include symptom onset, admission, death, test sampling, test processing
sim_apply_delay( df, ..., fn_p_symptomatic = ~0.5, fn_p_admitted = ~0.1, fn_p_died = ~0.05, fn_p_tested = ~0.8, seed = Sys.time() )sim_apply_delay( df, ..., fn_p_symptomatic = ~0.5, fn_p_admitted = ~0.1, fn_p_died = ~0.05, fn_p_tested = ~0.8, seed = Sys.time() )
df |
a line list dataframe arising from e.g.
Any grouping allowed. OR with columns:
Minimally grouped by: statistic (and other groupings allowed). |
... |
Named arguments passed on to
Named arguments passed on to
|
fn_p_symptomatic, fn_p_admitted, fn_p_died, fn_p_tested
|
Function that
returns a probability between 0 and 1 for each row of the input dataframe.
A |
seed |
RNG seed for reproducibility |
Depends on input, either:
a wide format line list with additional XX, XX_time and XX_delay columns,
for each of the set of statistics generated.
a long format set of counts of different statistics i.e. infections,
symptoms, admission, death, sample (tests taken), results (test results) .
tmp = sim_branching_process( changes = dplyr::tibble(t = c(0,20,40,60,80,110), R = c(1.8,1.5,0.9,1.5,0.8,1.2)), max_time = 120, seed = 100 ) tmp2 = tmp %>% sim_apply_delay() tmp2 %>% dplyr::glimpse()tmp = sim_branching_process( changes = dplyr::tibble(t = c(0,20,40,60,80,110), R = c(1.8,1.5,0.9,1.5,0.8,1.2)), max_time = 120, seed = 100 ) tmp2 = tmp %>% sim_apply_delay() tmp2 %>% dplyr::glimpse()
Generate a line list from a branching process model parametrised by reproduction number
sim_branching_process( changes = dplyr::tibble(t = c(0, 40), rt = c(2, 0.8)), max_time = 80, seed = Sys.time(), fn_Rt = cfg_step_fn(changes), fn_ip = ~example_ip(), fn_kappa = ~1, imports_df = NULL, fn_imports = ~ifelse(.x == 0, 30, 0), fn_list_next_gen = list(), max_size = 10000, ... )sim_branching_process( changes = dplyr::tibble(t = c(0, 40), rt = c(2, 0.8)), max_time = 80, seed = Sys.time(), fn_Rt = cfg_step_fn(changes), fn_ip = ~example_ip(), fn_kappa = ~1, imports_df = NULL, fn_imports = ~ifelse(.x == 0, 30, 0), fn_list_next_gen = list(), max_size = 10000, ... )
changes |
a dataframe containing a |
max_time |
maximum duration of simulation |
seed |
random seed |
fn_Rt |
can be specified instead of |
fn_ip |
a function that takes input vector |
fn_kappa |
a vectorised function taking |
imports_df |
a data frame containing minimally |
fn_imports |
a time varying function the defines the number of infected
importations. If |
fn_list_next_gen |
a named list of functions. The name corresponds to
metadata columns in the simulation, the function is a |
max_size |
the maximum size of a single generation. If a generation exceeds this limit the branching process terminates with a warning that the simulation is incomplete. |
... |
not used |
a line list of cases for a simulated outbreak
A dataframe containing the following columns:
id (unique_id) - Patient level unique id
time (ggoutbreak::time_period) - Time of infection. A 'time_period'
Any grouping possible.
tmp = sim_branching_process( changes = dplyr::tibble(t = c(0,40), R = c(1.5,0.8)), max_time = 120, seed = 100, fn_imports = ~ ifelse(.x<10,1,0) ) if(interactive()) { plot_cases(tmp, mapping=ggplot2::aes(fill=as.factor(generation)),linewidth=0.1, colour="white") } # imports can also be specified as a dataframe, which allows additional # metadata in the line list. An example of which is as follows: imports_df = dplyr::tribble( ~time, ~variant, ~count, 0:4, "wild-type", 100, 10:14, "alpha", 5, )tmp = sim_branching_process( changes = dplyr::tibble(t = c(0,40), R = c(1.5,0.8)), max_time = 120, seed = 100, fn_imports = ~ ifelse(.x<10,1,0) ) if(interactive()) { plot_cases(tmp, mapping=ggplot2::aes(fill=as.factor(generation)),linewidth=0.1, colour="white") } # imports can also be specified as a dataframe, which allows additional # metadata in the line list. An example of which is as follows: imports_df = dplyr::tribble( ~time, ~variant, ~count, 0:4, "wild-type", 100, 10:14, "alpha", 5, )
Standard convolution assumes one delay distribution. This is actually not what we see in reality as delays can depend on any factors, including the day of week. This function applies a convolution to an input time-series when that convolution is expressed as a function (usually of time, but can be of anything in the input dataframe). The convolution is then sampled using a poisson or negative binomial
sim_convolution( df = i_sim_count_data, p_fn, delay_fn, ..., input = "infections", output, kappa = 1, from = c("count", "rate") )sim_convolution( df = i_sim_count_data, p_fn, delay_fn, ..., input = "infections", output, kappa = 1, from = c("count", "rate") )
df |
a count dataframe from e.g.
Minimally grouped by: statistic (and other groupings allowed). |
p_fn |
a function that takes a time parameter and potentially and
returns probability of observation given something occurs. a no-op for this
parameter is |
delay_fn |
a function that takes time and returns the probability of
observation (given it occurred) over time since infection (i.e. |
... |
not used |
input |
the input statistic |
output |
the output statistic |
kappa |
dispersion. scaled such that poisson dispersion is 1. Values must be 0 (no dispersion), 1 (poisson dispersion) or greater than 1 for over-dispersion. |
from |
Controls if you base future counts on previous counts or on
underlying rate, defaults to |
return the result of applying this convolution to the data.
weekday_delay = make_fixed_ip(mean = 5, sd = 2) weekend_delay = make_fixed_ip(mean = 6, sd = 2) delay_fn = ~ ifelse(.x %% 7 %in% c(6,7), list(weekend_delay), list(weekday_delay)) p_fn = ~ ifelse(.x < 20, 0.5, 0.75) data = dplyr::tibble( time=1:40, count = rep(100,40), rate = rep(100,40), statistic="infections") %>% dplyr::group_by(statistic) delayed = data %>% sim_convolution(p_fn,delay_fn,output="delayed") %>% dplyr::filter(statistic=="delayed") if (interactive()) ggplot2::ggplot(delayed,ggplot2::aes(x=time))+ ggplot2::geom_line(ggplot2::aes(y=rate))+ ggplot2::geom_line(ggplot2::aes(y=count)) # other example delay functions delay_fn = cfg_gamma_ip_fn( ~ ifelse(.x<5, 8, 4))weekday_delay = make_fixed_ip(mean = 5, sd = 2) weekend_delay = make_fixed_ip(mean = 6, sd = 2) delay_fn = ~ ifelse(.x %% 7 %in% c(6,7), list(weekend_delay), list(weekday_delay)) p_fn = ~ ifelse(.x < 20, 0.5, 0.75) data = dplyr::tibble( time=1:40, count = rep(100,40), rate = rep(100,40), statistic="infections") %>% dplyr::group_by(statistic) delayed = data %>% sim_convolution(p_fn,delay_fn,output="delayed") %>% dplyr::filter(statistic=="delayed") if (interactive()) ggplot2::ggplot(delayed,ggplot2::aes(x=time))+ ggplot2::geom_line(ggplot2::aes(y=rate))+ ggplot2::geom_line(ggplot2::aes(y=count)) # other example delay functions delay_fn = cfg_gamma_ip_fn( ~ ifelse(.x<5, 8, 4))
Apply a time-varying probability and delay function to linelist data
sim_delay( df = i_sim_linelist, p_fn, delay_fn, input = "time", output = "event", seed = Sys.time() )sim_delay( df = i_sim_linelist, p_fn, delay_fn, input = "time", output = "event", seed = Sys.time() )
df |
a line list dataframe arising from e.g.
Any grouping allowed. |
p_fn |
Function that returns a probability between 0 and 1 for each row
of the input dataframe. A |
delay_fn |
A function that calculates the time to event onset from the
|
input |
a time column from which to calculate the delay from. |
output |
an output column set name (defaults to |
seed |
RNG seed for reproducibility |
the line list with extra columns with prefix given by output,
specifying whether the event was observed, the delay and the simulation
time.
tmp = sim_branching_process( changes = dplyr::tibble(t = c(0,20,40,60,80,110), R = c(1.8,1.5,0.9,1.5,0.8,1.2)), max_time = 120, seed = 100 ) tmp2 = tmp %>% sim_delay( p_fn = ~ rbern(.x, 0.8), delay_fn = ~ rgamma2(.x, mean = 5), ) tmp2 %>% dplyr::glimpse()tmp = sim_branching_process( changes = dplyr::tibble(t = c(0,20,40,60,80,110), R = c(1.8,1.5,0.9,1.5,0.8,1.2)), max_time = 120, seed = 100 ) tmp2 = tmp %>% sim_delay( p_fn = ~ rbern(.x, 0.8), delay_fn = ~ rgamma2(.x, mean = 5), ) tmp2 %>% dplyr::glimpse()
Delayed observations means that if, for example a case is only attributed to a disease after a delay there is right censoring of the data. There can be very complex patterns of right censoring if for example observations are batched and published weekly. During COVID in the UK some death data was published frequently but some was retrospectively reported on monthly intervals, depending on where the patient died, which lead to complex time dependent biases in the death data. Given the description of the delay, this function will simulate this effect for count data. In another example there were delays reporting test results in the run up to Christmas which resulted in case rates apparently dropping as schools broke up. This could have affected timing of the 2021 Christmas lockdown.
sim_delayed_observation( df = i_sim_count_data, delay_fn, ..., input = "infections", output = input, max_time = max(df$time) )sim_delayed_observation( df = i_sim_count_data, delay_fn, ..., input = "infections", output = input, max_time = max(df$time) )
df |
a count dataframe from e.g. |
delay_fn |
a function that takes time and returns the probability of
observation (given it occurred) over time since infection (i.e. tau) as an
ip delay distribution. This does not have to sum to 1 (e.g. mapping
incidence to prevalence) but if not then it will behave as if some fraction
or events are not observed (or observed multiple times). See
|
... |
not used |
input |
the input statistic (defaults to |
output |
the output column name (defaults to same as |
max_time |
the date on which censoring is taking place. |
the result of applying this right censoring to the data.
weekday_delay = make_fixed_ip(mean = 5, sd = 2) weekend_delay = make_fixed_ip(mean = 7, sd = 2) delay_fn = ~ ifelse(.x %% 7 %in% c(6,7), list(weekend_delay), list(weekday_delay)) data = dplyr::tibble(time=1:40, count = rep(100,40), statistic="infections") %>% dplyr::group_by(statistic) %>% sim_delayed_observation(delay_fn,output="delayed") if (interactive()) ggplot2::ggplot(data,ggplot2::aes(x=time,colour=statistic))+ ggplot2::geom_line(ggplot2::aes(y=count))weekday_delay = make_fixed_ip(mean = 5, sd = 2) weekend_delay = make_fixed_ip(mean = 7, sd = 2) delay_fn = ~ ifelse(.x %% 7 %in% c(6,7), list(weekend_delay), list(weekday_delay)) data = dplyr::tibble(time=1:40, count = rep(100,40), statistic="infections") %>% dplyr::group_by(statistic) %>% sim_delayed_observation(delay_fn,output="delayed") if (interactive()) ggplot2::ggplot(data,ggplot2::aes(x=time,colour=statistic))+ ggplot2::geom_line(ggplot2::aes(y=count))
All the simulations should include details of major changes in the simulation
input parameters, particularly for step functions. This set of events can be
directly represented on a ggoutbreak plot using the events parameter
sim_events(df)sim_events(df)
df |
the output of a |
an events dataframe
sim_events(example_poisson_rt())sim_events(example_poisson_rt())
ggoutbreak simulation as a ggplot2 layer.simulations are typically parameterised by time varying $R_t$ or by growth
rate, and this relationship is embedded in the simulation outputs. Plotting
the value of these on the default ggoutbreak plots requires extracting this
function and rescaling it to align with the dates, which is what this function
does.
sim_geom_function(df, ...)sim_geom_function(df, ...)
df |
the output of a |
... |
Named arguments passed on to
|
a geom_function of the parameter
ggplot2::ggplot()+ sim_geom_function(example_poisson_rt(), xlim=as.Date("2019-12-29")+c(0,80))+ ggplot2::scale_x_date()ggplot2::ggplot()+ sim_geom_function(example_poisson_rt(), xlim=as.Date("2019-12-29")+c(0,80))+ ggplot2::scale_x_date()
Generate a multinomial outbreak defined by per class growth rates and a poisson model
sim_multinomial( changes = dplyr::tibble(t = c(0, 20, 40, 60, 80), variant1 = c(0.1, 0, -0.1, 0, 0.1), variant2 = c(0.15, 0.05, -0.05, -0.01, 0.05), variant3 = c(0, 0.05, -0.05, +0.05, -0.05), ), initial = c(100, 100, 100), time_unit = "1 day", ... )sim_multinomial( changes = dplyr::tibble(t = c(0, 20, 40, 60, 80), variant1 = c(0.1, 0, -0.1, 0, 0.1), variant2 = c(0.15, 0.05, -0.05, -0.01, 0.05), variant3 = c(0, 0.05, -0.05, +0.05, -0.05), ), initial = c(100, 100, 100), time_unit = "1 day", ... )
changes |
a list of time points in column |
initial |
the size of the initial outbreak per class. There should be one entry per class |
time_unit |
e.g. a daily or weekly time series: "1 day", "7 days" |
... |
Named arguments passed on to
|
a case count time series including class, count and time columns
if (interactive()) { plot_counts( sim_multinomial() %>% dplyr::glimpse() ) }if (interactive()) { plot_counts( sim_multinomial() %>% dplyr::glimpse() ) }
Generate an outbreak case count series defined by growth rates using a poisson model.
sim_poisson_model( changes = dplyr::tibble(t = c(0, 20, 40, 60, 80), growth = c(0.1, 0, -0.1, 0, 0.1)), kappa = 1, max_time = 104, seed = Sys.time(), fn_growth = cfg_step_fn(changes), fn_imports = ~ifelse(.x == 0, 100, 0), time_unit = "1 day" )sim_poisson_model( changes = dplyr::tibble(t = c(0, 20, 40, 60, 80), growth = c(0.1, 0, -0.1, 0, 0.1)), kappa = 1, max_time = 104, seed = Sys.time(), fn_growth = cfg_step_fn(changes), fn_imports = ~ifelse(.x == 0, 100, 0), time_unit = "1 day" )
changes |
a dataframe holding change time points ( |
kappa |
a dispersion parameter. 1 is no dispersion (compared to poisson), smaller values mean more dispersion. |
max_time |
the desired length of the time series, |
seed |
a random seed |
fn_growth |
a function that takes input vector |
fn_imports |
a function that takes input vector |
time_unit |
e.g. a daily or weekly time series: "1 day", "7 days" |
A dataframe of case counts
A dataframe containing the following columns:
statistic (character) - An identifier for the statistic, whether that be infections, admissions, deaths
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a 'time_period'
Minimally grouped by: statistic (and other groupings may be present).
tmp2 = sim_poisson_model(seed=100, fn_imports = ~ ifelse(.x %in% c(0,50),100,0)) if (interactive()) { ggplot2::ggplot(tmp2)+ggplot2::geom_point(ggplot2::aes(x=time,y=count)) }tmp2 = sim_poisson_model(seed=100, fn_imports = ~ ifelse(.x %in% c(0,50),100,0)) if (interactive()) { ggplot2::ggplot(tmp2)+ggplot2::geom_point(ggplot2::aes(x=time,y=count)) }
Generate an outbreak case count series defined by Reproduction number using a poisson model.
sim_poisson_Rt_model( changes = dplyr::tibble(t = c(0, 40), rt = c(2.5, 0.8)), kappa = 1, max_time = 80, seed = Sys.time(), fn_Rt = cfg_step_fn(changes), fn_imports = ~ifelse(.x == 0, 30, 0), fn_ip = ~example_ip(), time_unit = "1 day" )sim_poisson_Rt_model( changes = dplyr::tibble(t = c(0, 40), rt = c(2.5, 0.8)), kappa = 1, max_time = 80, seed = Sys.time(), fn_Rt = cfg_step_fn(changes), fn_imports = ~ifelse(.x == 0, 30, 0), fn_ip = ~example_ip(), time_unit = "1 day" )
changes |
a dataframe holding change time points ( |
kappa |
a dispersion parameter. 1 is no dispersion (compared to poisson), smaller values mean more dispersion. |
max_time |
the desired length of the time series, |
seed |
a random seed |
fn_Rt |
a function that takes input vector |
fn_imports |
a function that takes input vector |
fn_ip |
a function that takes input vector |
time_unit |
e.g. a daily or weekly time series: "1 day", "7 days" |
A dataframe of case counts
A dataframe containing the following columns:
statistic (character) - An identifier for the statistic, whether that be infections, admissions, deaths
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a 'time_period'
Minimally grouped by: statistic (and other groupings may be present).
tmp = sim_poisson_Rt_model(kappa=1, seed=100, fn_imports = ~ ifelse(.x %in% c(0,50),100,0)) if (interactive()) { ggplot2::ggplot(tmp,ggplot2::aes(x=time,y=count))+ggplot2::geom_point()+ ggplot2::geom_line() }tmp = sim_poisson_Rt_model(kappa=1, seed=100, fn_imports = ~ ifelse(.x %in% c(0,50),100,0)) if (interactive()) { ggplot2::ggplot(tmp,ggplot2::aes(x=time,y=count))+ggplot2::geom_point()+ ggplot2::geom_line() }
This function simulates an SEIR (Susceptible-Exposed-Infectious-Recovered) model
where the transmission rate (beta) varies over time.
sim_seir_model( changes = dplyr::tibble(t = c(0, 30), dBeta = c(1, 0.5)), mean_latent_period, mean_gen_time, R0 = 2.5, fn_dBeta = cfg_step_fn(changes), N = 10000, imports = 10, max_time = 104, seed = Sys.time() )sim_seir_model( changes = dplyr::tibble(t = c(0, 30), dBeta = c(1, 0.5)), mean_latent_period, mean_gen_time, R0 = 2.5, fn_dBeta = cfg_step_fn(changes), N = 10000, imports = 10, max_time = 104, seed = Sys.time() )
changes |
a dataframe holding change time points ( |
mean_latent_period |
Mean time from infection to becoming infectious (E to I), assumed to be exponentially distributed. |
mean_gen_time |
The average generation time (will be latent period+infectious duration). |
R0 |
the initial reproduction number |
fn_dBeta |
A function of time |
N |
Total population size. Defaults to 10,000. |
imports |
Initial number of infectious individuals. Defaults to 10. |
max_time |
Maximum simulation time (in days or time units). Defaults to 100. |
seed |
a random seed |
The latent period (time from infection to becoming infectious) is assumed to be exponentially distributed. The infectious period is derived from the given mean of the generation time and the latent period.
The model assumes:
Latent period ~ Exponential(sigma), where sigma = 1 / mean_latent_period
Infectious period ~ Exponential(gamma), where gamma = 1 / (mean_gen_time - mean_latent_period)
Generation time ~ Exponential(sigma) + Exponential(gamma)
beta0 = R0 * gamma
Transmission rate beta(t) = fn_dBeta(t) * beta0
The generation time is defined as the sum of the latent and infectious periods.
A dataframe containing the following columns:
statistic (character) - An identifier for the statistic, whether that be infections, admissions, deaths
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
Minimally grouped by: statistic (and other groupings allowed).
# Example: Lockdown after day 30 seir_output <- sim_seir_model( mean_latent_period = 4, mean_gen_time = 7, R0 = 2.5, fn_dBeta = function(t) ifelse(t < 30, 1, 0.5) ) if (interactive()) { plot_ip(attr(seir_output,"ip")) plot_counts(seir_output) }# Example: Lockdown after day 30 seir_output <- sim_seir_model( mean_latent_period = 4, mean_gen_time = 7, R0 = 2.5, fn_dBeta = function(t) ifelse(t < 30, 1, 0.5) ) if (interactive()) { plot_ip(attr(seir_output,"ip")) plot_counts(seir_output) }
This function converts a line list into a daily count of incident cases, plus infections, admissions, deaths, test samples, test results if present. Censoring of these counts can also be defined. Whilst summarising various network measures such as the forward looking case reproduction number are also calculated.
sim_summarise_linelist( df = i_sim_linelist, ..., censoring = list(admitted = function(t) rgamma2(t, mean = 5), death = function(t) rgamma2(t, mean = 10), sample = function(t, result_delay) result_delay), max_time = max(df$time) )sim_summarise_linelist( df = i_sim_linelist, ..., censoring = list(admitted = function(t) rgamma2(t, mean = 5), death = function(t) rgamma2(t, mean = 10), sample = function(t, result_delay) result_delay), max_time = max(df$time) )
df |
a line list dataframe arising from e.g.
Any grouping allowed. |
... |
the grouping to include in the summarisation. |
censoring |
a named list of column names (without the |
max_time |
the censoring time for this observation. If this is a vector there will be multiple time series in the output |
a count data frame with additional statistics.
A dataframe containing the following columns:
statistic (character) - An identifier for the statistic, whether that be infections, admissions, deaths
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a 'time_period'
Minimally grouped by: statistic (and other groupings may be present).
sim = sim_branching_process( changes = dplyr::tibble(t = c(0,40), R = c(1.7,0.8)), max_time = 120, seed = 100, fn_imports = ~ ifelse(.x==0,100,0) ) tmp = sim %>% sim_summarise_linelist() p1 = plot_counts(tmp) p2 = ggplot2::ggplot(tmp, ggplot2::aes(x=as.Date(time)))+ ggplot2::geom_point(ggplot2::aes(y=rt.case,colour="case"))+ ggplot2::geom_point(ggplot2::aes(y=rt.inst,colour="instantaneous"))+ ggplot2::geom_line(ggplot2::aes(y=rt.weighted))+ ggplot2::coord_cartesian(ylim=c(0,3.5))+ ggplot2::xlab(NULL) patchwork::wrap_plots(p1,p2,ncol=1,axes="collect")sim = sim_branching_process( changes = dplyr::tibble(t = c(0,40), R = c(1.7,0.8)), max_time = 120, seed = 100, fn_imports = ~ ifelse(.x==0,100,0) ) tmp = sim %>% sim_summarise_linelist() p1 = plot_counts(tmp) p2 = ggplot2::ggplot(tmp, ggplot2::aes(x=as.Date(time)))+ ggplot2::geom_point(ggplot2::aes(y=rt.case,colour="case"))+ ggplot2::geom_point(ggplot2::aes(y=rt.inst,colour="instantaneous"))+ ggplot2::geom_line(ggplot2::aes(y=rt.weighted))+ ggplot2::coord_cartesian(ylim=c(0,3.5))+ ggplot2::xlab(NULL) patchwork::wrap_plots(p1,p2,ncol=1,axes="collect")
This time-series has no statistical noise and is useful for testing things. It has a fixed known value for infections and growth rate (fixed at 0.05 and -0.05 per day), and a instantaneous reproduction number which is based on the provided infectivity profile. A fixed denominator gives a known proportion and a relative growth rate that is the same as the growth rate.
sim_test_data(ip = example_ip(), duration = 500, period = 50)sim_test_data(ip = example_ip(), duration = 500, period = 50)
ip |
an infectivity profile. any uncertainty will be collapsed into the central distribution. |
duration |
the total length of the time-series |
period |
the duration of each positive or negative growth phase |
a time series with count, incidence, growth, rt,
proportion and relative.growth columns
sim_test_data() %>% dplyr::glimpse()sim_test_data() %>% dplyr::glimpse()
Generate a single infectivity profile from multiple bootstraps
summarise_ip(ip = i_empirical_ip)summarise_ip(ip = i_empirical_ip)
ip |
the infectivity profile to summarise.
Minimally grouped by: boot (and other groupings allowed). |
an infectivity profile
This function operates on timeseries data not linelists (see time_summarise())
for line lists. If a very granular timeseries is regrouped and this function
is applied the resulting dataframe will be
time_aggregate( df = i_timestamped, ..., .groups = NULL, .cols = NULL, .fns = NULL )time_aggregate( df = i_timestamped, ..., .groups = NULL, .cols = NULL, .fns = NULL )
df |
an optionally grouped time series. Grouping should not include the time
column. The grouping works differently from |
... |
A set of |
.groups |
as per |
.cols |
Optional dplyr column specification for the data that will be
summarised. This is useful if there are lots of columns you want to
summarise using the same function (e.g. |
.fns |
Optional a set of function specifications as per |
the summarised time series preserving the time column, and with the grouping
structure involving one fewer levels than the input
example_england_covid_by_age() %>% time_aggregate(count = sum(count), denom = sum(denom)) %>% dplyr::glimpse() example_england_covid_by_age() %>% time_aggregate(.fns=mean, .cols=dplyr::where(is.numeric)) %>% dplyr::glimpse()example_england_covid_by_age() %>% time_aggregate(count = sum(count), denom = sum(denom)) %>% dplyr::glimpse() example_england_covid_by_age() %>% time_aggregate(.fns=mean, .cols=dplyr::where(is.numeric)) %>% dplyr::glimpse()
This principally is designed to take a record of single events and produce a summary time-series count of events by group, class and date. The default behaviour is to guess the cadence of the input data and summarise the event line list to a (set of) regular time-series counts for use in incidence and growth rate estimates.
time_summarise( df = i_dated, unit, anchor = "start", rectangular = FALSE, ..., .fill = list(count = 0) )time_summarise( df = i_dated, unit, anchor = "start", rectangular = FALSE, ..., .fill = list(count = 0) )
df |
a line list of data you want to summarise, optionally grouped.
If this is grouped then each group is treated independently. The remaining
columns must contain a |
unit |
a period e.g. "1 week" |
anchor |
one of a date, "start" or "end" or a weekday name e.g. "mon" this will always be one of the start of the time periods we are cutting into |
rectangular |
should the resulting time series be the same length for all groups? This is only the case if you can be sure that your data is complete for all subgroups, otherwise missing data will be treated as zero counts. This is important if leading and trailing missing data in one subgroup can be due to a reporting delay in that subgroup, in which case a rectangular time series will erroneously fill in zero counts for this missing data. |
... |
a specification for a dplyr::summary(...) - optional, and if not
provided a |
.fill |
a list similar to |
If the data is given with a class column the time series are interpreted as
having a denominator, consisting of all the different classes within a time period.
This may be subtypes (e.g. variants, serotypes) or markers for test positivity.
In either case the resulting time series will have counts for all classes and
denominators for the combination.
There is flexibility for other kinds of summarisation if the raw data is not
count based (e.g. means of continuous variables) but in this case a the slider
package is usually going to be better, as time summarise will only look at non
overlapping time periods with fixed lengths.
There is another use case where an existing timeseries on a particular
frequency is aggregated to another less frequent basis (e.g. moving from a
daily timeseries to a weekly one). In this case the input will contain a
count column. In this mode no checks are made that the more frequent events
are all present before summarisation so the result may include different numbers
of input periods (e.g. going from weeks to months may be 4 or 5 weeks in each
month). If a class column is present the a classwise denominator is calculated
The output depends on whether or not the input was grouped and
had a class column. The most detailed output will be:
A dataframe containing the following columns:
denom (positive_integer) - Total test counts associated with the specified time frame
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
Any grouping allowed.
or a more minimal output if the input is only a plain list of dated events:
A dataframe containing the following columns:
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
Any grouping allowed.
# a set of random dates with a class column: input = dplyr::tibble( class = rep(c("A","B"),1000), date = as.Date("2020-01-01")+sample.int(100,2000,TRUE) ) # summarise daily counts, including denominators: daily = time_summarise(input, unit="1 day") %>% dplyr::glimpse() # summarise weekly counts, for sample data: time_summarise(input, unit="1 week") %>% dplyr::glimpse() # summarise daily counts into weekly: time_summarise(daily, unit="1 week") %>% dplyr::glimpse()# a set of random dates with a class column: input = dplyr::tibble( class = rep(c("A","B"),1000), date = as.Date("2020-01-01")+sample.int(100,2000,TRUE) ) # summarise daily counts, including denominators: daily = time_summarise(input, unit="1 day") %>% dplyr::glimpse() # summarise weekly counts, for sample data: time_summarise(input, unit="1 week") %>% dplyr::glimpse() # summarise daily counts into weekly: time_summarise(daily, unit="1 week") %>% dplyr::glimpse()
ggoutbreak compatible time series dataframeCoerce an object to a ggoutbreak compatible time series dataframe
timeseries(x, ...) ## Default S3 method: timeseries(x, ...) ## S3 method for class 'incidence2' timeseries(x, ...) ## S3 method for class 'data.frame' timeseries( x, ..., date = NULL, count = NULL, denom = NULL, population = NULL, class = NULL, declutter = FALSE ) ## S3 method for class 'incidence' timeseries(x, ..., declutter = FALSE)timeseries(x, ...) ## Default S3 method: timeseries(x, ...) ## S3 method for class 'incidence2' timeseries(x, ...) ## S3 method for class 'data.frame' timeseries( x, ..., date = NULL, count = NULL, denom = NULL, population = NULL, class = NULL, declutter = FALSE ) ## S3 method for class 'incidence' timeseries(x, ..., declutter = FALSE)
x |
An object to coerce e.g. a data frame |
... |
Named arguments passed on to
|
date |
R expression defining a date column (as a |
count |
R expression defining a count column (as an |
denom |
R expression defining a denominator column (as an |
population |
R expression defining a population column (as an |
class |
R expression defining a class column (as an |
declutter |
logical flag. If |
minimally a case count dataframe A dataframe containing the following columns:
count (positive_integer) - Positive case counts associated with the specified time frame
time (ggoutbreak::time_period + group_unique) - A (usually complete) set of singular observations per unit time as a time_period
Any grouping allowed.
utils::data("mers_2014_15", package="EpiEstim") set_default_unit("1 day") # Use an expression to generate a case count: # N.B. complex column names need to be surrounded with bcakticks like this: tmp = mers_2014_15$incidence %>% timeseries( date = `mers$dates`, count = local+imported ) %>% dplyr::glimpse() if (interactive()) { ip = make_fixed_ip(mean = mers_2014_15$si$mean_si, sd = mers_2014_15$si$std_si) tmp %>% poisson_locfit_model(ip=ip,window=14) %>% plot_rt() } # remove columns not needed by `ggoutbreak` outbreaks::dengue_fais_2011 %>% timeseries(date = onset_date, count = value, declutter = TRUE) %>% dplyr::glimpse() # date column already exists, could use `onset` or `death` as count outbreaks::ebola_kikwit_1995 %>% timeseries(count = onset) %>% dplyr::glimpse() # This data set needs grouping to make it a unique timeseries: ukc19::ltla_cases %>% timeseries(anchor="start", unit="1 day") %>% dplyr::glimpse() # from an incidence object: if (requireNamespace("incidence",quietly = TRUE)) { onset = outbreaks::ebola_sim$linelist$date_of_onset sex = outbreaks::ebola_sim$linelist$gender inc.week.gender = incidence::incidence(onset, interval = 7, groups = sex, standard = FALSE) inc.week.gender %>% timeseries() %>% dplyr::glimpse() d = Sys.Date() + sample(-3:10, 10, replace = TRUE) di = incidence::incidence(d, interval = "week", first_date = Sys.Date() - 10, standard = TRUE) di %>% timeseries(declutter=TRUE) %>% dplyr::glimpse() } if (requireNamespace("incidence2",quietly = TRUE)) { dat = outbreaks::ebola_sim_clean$linelist x = incidence2::incidence(dat, "date_of_onset", groups = c("gender", "hospital"), interval="week") x %>% timeseries() %>% dplyr::glimpse() }utils::data("mers_2014_15", package="EpiEstim") set_default_unit("1 day") # Use an expression to generate a case count: # N.B. complex column names need to be surrounded with bcakticks like this: tmp = mers_2014_15$incidence %>% timeseries( date = `mers$dates`, count = local+imported ) %>% dplyr::glimpse() if (interactive()) { ip = make_fixed_ip(mean = mers_2014_15$si$mean_si, sd = mers_2014_15$si$std_si) tmp %>% poisson_locfit_model(ip=ip,window=14) %>% plot_rt() } # remove columns not needed by `ggoutbreak` outbreaks::dengue_fais_2011 %>% timeseries(date = onset_date, count = value, declutter = TRUE) %>% dplyr::glimpse() # date column already exists, could use `onset` or `death` as count outbreaks::ebola_kikwit_1995 %>% timeseries(count = onset) %>% dplyr::glimpse() # This data set needs grouping to make it a unique timeseries: ukc19::ltla_cases %>% timeseries(anchor="start", unit="1 day") %>% dplyr::glimpse() # from an incidence object: if (requireNamespace("incidence",quietly = TRUE)) { onset = outbreaks::ebola_sim$linelist$date_of_onset sex = outbreaks::ebola_sim$linelist$gender inc.week.gender = incidence::incidence(onset, interval = 7, groups = sex, standard = FALSE) inc.week.gender %>% timeseries() %>% dplyr::glimpse() d = Sys.Date() + sample(-3:10, 10, replace = TRUE) di = incidence::incidence(d, interval = "week", first_date = Sys.Date() - 10, standard = TRUE) di %>% timeseries(declutter=TRUE) %>% dplyr::glimpse() } if (requireNamespace("incidence2",quietly = TRUE)) { dat = outbreaks::ebola_sim_clean$linelist x = incidence2::incidence(dat, "date_of_onset", groups = c("gender", "hospital"), interval="week") x %>% timeseries() %>% dplyr::glimpse() }
Estimates the variance-covariance matrix of log-incidence estimates by modelling residual autocorrelation with an exponential decay model.
vcov_from_residuals( data, mu, sigma, max_lag = 10, min_alpha = 0.01, max_alpha = 2 )vcov_from_residuals( data, mu, sigma, max_lag = 10, min_alpha = 0.01, max_alpha = 2 )
data |
Numeric vector: observed case counts I_t |
mu |
Numeric vector: estimated log-incidence (log lambda_t) |
sigma |
Numeric vector: estimated standard error of log lambda_t |
max_lag |
Integer: maximum lag for ACF estimation (default: 10) |
min_alpha |
Positive number: minimum decay rate (default: 0.01) |
max_alpha |
Positive number: maximum decay rate (default: 2.0) |
This function was generated by Qwen3-235B-A22B-2507
List with:
vcov_matrix: T x T estimated VCOV matrix for log lambda_t
alpha: fitted decay parameter
acf_obs: observed ACF of Pearson residuals
acf_fit: fitted ACF values
residuals: Pearson residuals used
times: time indices
T <- 50 mu <- 5 + 0.05 * (1:T) + stats::arima.sim(list(ar = 0.8), n = T, sd = 0.3) sigma <- rep(0.2, T) lambda <- exp(mu) data <- stats::rpois(T, lambda) result <- vcov_from_residuals(data, mu, sigma, max_lag = 8) dim(result$vcov_matrix) # Should be T x TT <- 50 mu <- 5 + 0.05 * (1:T) + stats::arima.sim(list(ar = 0.8), n = T, sd = 0.3) sigma <- rep(0.2, T) lambda <- exp(mu) data <- stats::rpois(T, lambda) result <- vcov_from_residuals(data, mu, sigma, max_lag = 8) dim(result$vcov_matrix) # Should be T x T
This function uses a single empirical distribution for the infectivity profile / generation time. If multiple are provided then the average central value is chosen (i.e. this does not propagate uncertainty in infectivity profile)
wallinga_lipsitch( r, y = i_empirical_ip, a1 = seq(0.5, length.out = length(y)), a0 = dplyr::lag(a1, default = 0) )wallinga_lipsitch( r, y = i_empirical_ip, a1 = seq(0.5, length.out = length(y)), a0 = dplyr::lag(a1, default = 0) )
r |
a growth rate (may be a vector) |
y |
an empirical infectivity profile either as a probability vector or as a dataframe of format: A dataframe containing the following columns:
Minimally grouped by: boot (and other groupings allowed). |
a1 |
the end time of the infectivity profile probability estimate (defaults to 0.5,1.5,2.5,...). |
a0 |
the start time of the infectivity profile probability estimate (defaults to 0,0.5,1.5,...). |
a reproduction number estimate based on r
# using a probability vector. wallinga_lipsitch(r=seq(-0.1,0.1,length.out=9), y=stats::dgamma(1:50, 5,2)) # using an infectivity profile wallinga_lipsitch(r=seq(-0.1,0.1,length.out=9), y=example_ip())# using a probability vector. wallinga_lipsitch(r=seq(-0.1,0.1,length.out=9), y=stats::dgamma(1:50, 5,2)) # using an infectivity profile wallinga_lipsitch(r=seq(-0.1,0.1,length.out=9), y=example_ip())
The wedge distribution has a support of 0 to 1 and has a linear probability density function over that support.
n |
number of observations |
x |
vector of quantiles |
q |
vector of quantiles |
p |
vector of probabilities |
log |
logical; if TRUE, probabilities p are given as log(p). |
log.p |
logical; if TRUE, probabilities p are given as log(p). |
lower.tail |
logical; if TRUE (default), probabilities are |
a |
a gradient from -2 (left skewed) to 2 (right skewed) |
The rwedge can be combined with quantile functions to
skew standard distributions, or introduce correlation or down weight
certain parts of the distribution.
a vector of probabilities, quantiles, densities or samples.
pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )pwedge(seq(0,1,0.1), a=1) dwedge(seq(0,1,0.1), a=1) qwedge(c(0.25,0.5,0.75), a=-1) stats::cor( stats::qnorm(rwedge(1000, a=2)), stats::qnorm(rwedge(1000, a=-2)) )
Extract the weekday, month or quarter, or the Julian time (days since some origin). These are generic functions: the methods for the internal date-time classes are documented here.
## S3 method for class 'time_period' weekdays(x, abbreviate = FALSE)## S3 method for class 'time_period' weekdays(x, abbreviate = FALSE)
x |
an object inheriting from class |
abbreviate |
logical vector (possibly recycled). Should the names be abbreviated? |
weekdays and months return a character
vector of names in the locale in use, i.e., Sys.getlocale("LC_TIME").
quarters returns a character vector of "Q1" to
"Q4".
julian returns the number of days (possibly fractional)
since the origin, with the origin as a "origin" attribute.
All time calculations in R are done ignoring leap-seconds.
Other components such as the day of the month or the year are
very easy to compute: just use as.POSIXlt and extract
the relevant component. Alternatively (especially if the components
are desired as character strings), use strftime.
DateTimeClasses, Date;
Sys.getlocale("LC_TIME") crucially for months() and weekdays().
## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time## first two are locale dependent: weekdays(.leap.seconds) months (.leap.seconds) quarters(.leap.seconds) ## Show how easily you get month, day, year, day (of {month, week, yr}), ... : ## (remember to count from 0 (!): mon = 0..11, wday = 0..6, etc !!) ##' Transform (Time-)Date vector to convenient data frame : dt2df <- function(dt, dName = deparse(substitute(dt))) { DF <- as.data.frame(unclass(as.POSIXlt( dt ))) `names<-`(cbind(dt, DF, deparse.level=0L), c(dName, names(DF))) } ## e.g., dt2df(.leap.seconds) # date+time dt2df(Sys.Date() + 0:9) # date ##' Even simpler: Date -> Matrix - dropping time info {sec,min,hour, isdst} d2mat <- function(x) simplify2array(unclass(as.POSIXlt(x))[4:7]) ## e.g., d2mat(seq(as.Date("2000-02-02"), by=1, length.out=30)) # has R 1.0.0's release date ## Julian Day Number (JDN, https://en.wikipedia.org/wiki/Julian_day) ## is the number of days since noon UTC on the first day of 4317 BCE. ## in the proleptic Julian calendar. To more recently, in ## 'Terrestrial Time' which differs from UTC by a few seconds ## See https://en.wikipedia.org/wiki/Terrestrial_Time julian(Sys.Date(), -2440588) # from a day floor(as.numeric(julian(Sys.time())) + 2440587.5) # from a date-time