misclass_prob = matrix(c(
.80, .10, .15,
.15, .80, .15,
.05, .10, .70
), nrow = 3, byrow = TRUE,
dimnames = list(LETTERS[1:3], LETTERS[1:3]))
misclass_prob A B C
A 0.80 0.1 0.15
B 0.15 0.8 0.15
C 0.05 0.1 0.70
Data Simulation with Monte Carlo Methods
University of Hohenheim
Change the first simulation so that the misclassification probabilities to allow for different patterns accuracies and errors: varying difficulties of the categories, systematic confusion of some categories.
Investigate the consequences of misclassification error for an analysis of two binary variables which were both measured with misclassification error.
A simple example with systematic confusion for illustrative purposes
Misclassification matrix as argument
sim_misclass = function(pop_dist = c(0.55, 0.3, 0.1, 0.05),
n = 1000,
misclass_prob = misclass_prob) {
# Population
k = length(pop_dist)
categories = LETTERS[1:k]
# Sample
sample_true = sample(x = categories,
size = n,
replace = TRUE,
prob = pop_dist) %>%
factor(levels = categories) %>%
sort()
freq_true = sample_true %>%
table() %>%
prop.table()
# Misclassification
sample_obs = table(sample_true) %>%
imap(~ sample(categories,
size = .x, replace = TRUE,
prob = misclass_prob[, .y])) %>%
unlist(use.names = FALSE) %>%
factor(levels = categories)
freq_obs = sample_obs %>%
table() %>%
prop.table()
# Error summary
rmse = sqrt(mean((pop_dist - freq_obs)^2))
# Output
out = lst(freq_true, freq_obs, rmse)
out
}sims %>%
ungroup() %>%
unnest_wider(res) %>%
unnest_longer(freq_obs, indices_to = "category") %>%
group_by(category) %>%
summarise(Q = list(quantile(freq_obs, probs = c(0.25, 0.5, 0.75)))) %>%
unnest_wider(Q) %>%
ggplot(aes(`50%`, fct_rev(category),
xmin = `25%`, xmax = `75%`)) +
geom_pointrange() +
geom_vline(xintercept = unlist(conditions$pop_dist), color = "red", linetype = 2) +
labs(x = str_glue("Median and IQR of the proportions from {i} simulation runs per condition.\nRed lines are shares in the population."),
y = "Category")This could be done with a close adaption of the second simulation, i.e., using the sample() function for data generation and misclassification.
However, simulation with binary variables is much simpler and more efficient with rbinom().
The following solution shows the latter option.
pop_baseline: Probability of the outcome in the baseline group, i.e., sample_x_true == 0
pop_difference: Difference between probabilities of the outcome in the baseline and treatment groups
logit_pop_baseline: logit (i.e., \(log(x / (1 - x)\)) of the baseline probability
pop_oddsratio: Odds ratio of the outcome in the baseline and treatment groups
pop_logodds: log(odds)
outcomes_true: Draw outcome n times from a binomial distribution, where the success probability is a function of the baseline probability and the log odds of the probabilities in the baseline and treatment groups
pop_baseline = 0.5
pop_difference = 0.1
logit_pop_baseline = log(pop_baseline / (1 - pop_baseline))
pop_oddsratio = ((pop_baseline + pop_difference) / (1 - (pop_baseline + pop_difference))) /
(pop_baseline / (1 - pop_baseline))
pop_logodds = log(pop_oddsratio)
outcomes_true = rbinom(n = n, size = 1,
prob = 1 / (1 + exp(- (logit_pop_baseline + pop_logodds * sample_x_true))))
prop.table(table(outcomes_true, sample_x_true), margin = 2) %>% round(2) sample_x_true
outcomes_true 0 1
0 0.53 0.40
1 0.47 0.60
Each observation is classified correctly with probability of accuracy, regardless of its true value.
correct_ are vectors of length n which record whether an observation was misclassified (1) or not (0).
abs(_true - correct_) switches the values for all observations which are noted as misclassified, correct_ == 1.
# Misclassification probabilities
accuracy = 0.8
correct_x = rbinom(n = n, size = 1,
prob = (1 - accuracy))
correct_outcome = rbinom(n = n, size = 1,
prob = (1 - accuracy))
# Misclassification
sample_x_obs = abs(sample_x_true - correct_x)
outcomes_obs = abs(outcomes_true - correct_outcome)
mean(sample_x_obs == sample_x_true); mean(outcomes_obs == outcomes_true)[1] 0.797
[1] 0.801
We use Welch’s t-test to estimate the differences in the observed outcome between the observed groups.
We extract the p-value, p_obs, for computing statistical power.
We extract the estimated absolute difference, diff_obs, and its ratio to the true population difference, diff_obs_rel.
sim_misclass_binary = function(pop_dist_x = 0.5,
pop_baseline = 0.5,
pop_difference = 0.1,
n = 1000,
accuracy = 0.8) {
# Population
logit_pop_baseline = log(pop_baseline / (1 - pop_baseline))
pop_oddsratio = ((pop_baseline + pop_difference) / (1 - (pop_baseline + pop_difference))) /
(pop_baseline / (1 - pop_baseline))
pop_logodds = log(pop_oddsratio)
# True sample
sample_x_true = rbinom(n = n,
size = 1,
prob = pop_dist_x)
outcomes_true = rbinom(n = n, size = 1,
prob = 1 / (1 + exp(- (logit_pop_baseline + pop_logodds * sample_x_true))))
# Misclassification probabilities
correct_x = rbinom(n = n, size = 1, prob = (1 - accuracy))
correct_outcome = rbinom(n = n, size = 1, prob = (1 - accuracy))
# Misclassification
sample_x_obs = abs(sample_x_true - correct_x)
outcomes_obs = abs(outcomes_true - correct_outcome)
# Model
t_res = t.test(outcomes_obs ~ sample_x_obs)
p_obs = t_res$p.value
diff_obs = unname(diff(t_res$estimate))
diff_obs_rel = diff_obs / pop_difference
# Output
out = lst(p_obs, diff_obs, diff_obs_rel)
return(out)
}pop_dist_x, the prevalence of the outcome in the baseline group, pop_baseline, and, most importantly, the accuracy of the measurements.conditions = expand_grid(
pop_dist_x = c(.1, .3, .5),
pop_baseline = c(.1, .3, .5),
pop_difference = 0.15,
n = 1000,
accuracy = seq(from = 0.6, to = 1, by = 0.1)
) %>%
rowid_to_column(var = "condition")
conditions# A tibble: 45 × 6
condition pop_dist_x pop_baseline pop_difference n accuracy
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.1 0.1 0.15 1000 0.6
2 2 0.1 0.1 0.15 1000 0.7
3 3 0.1 0.1 0.15 1000 0.8
4 4 0.1 0.1 0.15 1000 0.9
5 5 0.1 0.1 0.15 1000 1
6 6 0.1 0.3 0.15 1000 0.6
7 7 0.1 0.3 0.15 1000 0.7
8 8 0.1 0.3 0.15 1000 0.8
9 9 0.1 0.3 0.15 1000 0.9
10 10 0.1 0.3 0.15 1000 1
# … with 35 more rows
i = 1000 # less simulations to save some workshop time
set.seed(40)
tic()
sims = map_dfr(1:i, ~ conditions) %>%
rowid_to_column(var = "sim") %>%
rowwise() %>%
mutate(res = list(sim_misclass_binary(pop_dist_x = pop_dist_x,
pop_baseline = pop_baseline,
pop_difference = pop_difference,
n = n, accuracy = accuracy)))
toc()43.462 sec elapsed
sims %>%
ungroup() %>%
unnest_wider(res) %>%
group_by(accuracy, pop_baseline, pop_dist_x) %>%
summarise(Q = list(quantile(diff_obs_rel, probs = c(0.25, 0.5, 0.75)))) %>%
unnest_wider(Q) %>%
ggplot(aes(`50%`, factor(accuracy), color = factor(pop_dist_x),
xmin = `25%`, xmax = `75%`)) +
geom_pointrange(position = position_dodge(width = -.2)) +
facet_wrap(vars(pop_baseline), labeller = "label_both") +
labs(y = "accuracy",
x = str_glue("Median and IQR of the ratio true by observed difference\nfrom {i} simulation runs per condition."),
color = "Population\ntreatment\nprevalence")sims %>%
ungroup() %>%
unnest_wider(res) %>%
group_by(accuracy, pop_baseline, pop_dist_x) %>%
summarise(P_p05 = mean(p_obs < 0.05)) %>%
ggplot(aes(accuracy, P_p05, color = factor(pop_dist_x))) +
geom_point() + geom_line() +
facet_wrap(vars(pop_baseline), labeller = "label_both") +
labs(y = "Power at p < .05", color = "Population\ntreatment\nprevalence")