Visualize the Influence of Outliers
code
intermediate
r
If you’re looking at linear relationships, give this code a spin. It’ll automatically flag outliers based on 1.5*IQR and show you the fit with and without the outliers. Since the output is a ggplot, you and add to it or tweak the aesthetics of the output (see example below).
Here’s the function.
<- function(temp = filter(M, Time == "Baseline"),
scatter_w_wo_outliers X = "bkkca",
Y = "Ihtk.0"){
# flag outliers based on 1.5xIQR from median
<- median(temp[[X]], na.rm = T) - (1.5 * IQR(temp[[X]], na.rm = T))
X_low <- median(temp[[X]], na.rm = T) + (1.5 * IQR(temp[[X]], na.rm = T))
X_high <- median(temp[[Y]], na.rm = T) - (1.5 * IQR(temp[[Y]], na.rm = T))
Y_low <- median(temp[[Y]], na.rm = T) + (1.5 * IQR(temp[[Y]], na.rm = T))
Y_high
<- (temp[[X]] > X_low) * (temp[[X]] < X_high)
X_pass <- (temp[[Y]] > Y_low) * (temp[[Y]] < Y_high)
Y_pass
$flag <- ifelse((X_pass * Y_pass) == 1, T, F)
temp
# Duplicate so we have dataset 1, 2 (introduces duplicates)
<- rbind(temp[temp$flag == T, ], mutate(temp, flag = F))
temp
<- y ~ x
formula1
<- ggplot(temp, aes_string(X, Y, color = "flag"))+
plt geom_smooth(data = temp, method = lm, se = F, fullrange = T)+
geom_point(data = temp)+
geom_point(data = temp, color = "black", shape = 1)+
geom_point(data = temp[temp$flag, ])+
::stat_poly_eq(aes(label = paste(stat(eq.label), "*" with "*",
ggpmiscstat(rr.label), "*", "*",
stat(f.value.label), "*", and "*",
stat(p.value.label), "*"."",
sep = "")),
formula = formula1, parse = TRUE, size = 4)+
scale_color_manual(values = c("darkgray", "black"))+
theme_bw()+
theme(legend.position = "")
return(plt)
}
Here’s a reproducible example. We’re creating a Simpson’s paradox by giving the “outliers” a negative slope and the real data a positive slope. I’ve added a red line showing the true relationship.
set.seed(45645684)
<- data.frame(x = rnorm(30, mean = 10, sd = 4),
df noise = runif(30, min = -2, max = 2),
y = NA,
is_outlier = rbinom(30, 1, prob = 0.2))
$y <- ifelse(df$is_outlier,
df-5*df$x+df$noise,
2*df$x+df$noise)
scatter_w_wo_outliers(temp = df,
X = "x",
Y = "y")+
geom_abline(slope = 2, intercept = 0, color = "firebrick")
2020-6-4 Daniel Here’s a related utility function. For a given column it’ll return a logical vector where outliers are FALSE
.
<- function(col_in, multiplier = 1.5){
w_in_x_iqr <- as.vector(col_in)
col_in
<- median(col_in, na.rm = T) - (multiplier * IQR(col_in, na.rm = T))
X_low <- median(col_in, na.rm = T) + (multiplier * IQR(col_in, na.rm = T))
X_high <- (col_in > X_low) * (col_in < X_high)
X_pass
return(as.logical(X_pass))
}
e.g.
> mutate(M, ex = w_in_x_iqr(bkkca)) %>% select(bkkca, ex) %>% arrange(bkkca) %>% tail()
# A tibble: 6 x 2
# bkkca ex
# <dbl> <lgl>
# 1 3056. TRUE
# 2 3222. TRUE
# 3 3255. TRUE # Within bounds
# 4 3552. FALSE # Outside bounds
# 5 3817. FALSE
# 6 6740. FALSE