Factsheet: \(F\)-distribution
#| '!! shinylive warning !!': |
#| shinylive does not work in self-contained HTML documents.
#| Please set `embed-resources: false` in your metadata.
#| standalone: true
#| viewerHeight: 760
library(shiny)
library(bslib)
library(ggplot2)
# User interface for the F-distribution calculator.
# Layout: parameter inputs (left, 4/12 width), density plot (right, 8/12 width)
# and a full-width results card underneath.
ui <- local({
  # Every x-value slider shares the same range and step; only the input id,
  # the label and the starting value differ.
  x_slider <- function(id, label, start) {
    sliderInput(id, label, min = 0, max = 10, value = start, step = 0.1)
  }

  # Degrees-of-freedom inputs.
  df_inputs <- list(
    numericInput(
      "df1", "Numerator degrees of freedom (d₁):",
      value = 5, min = 1, step = 1
    ),
    numericInput(
      "df2", "Denominator degrees of freedom (d₂):",
      value = 10, min = 1, step = 1
    )
  )

  # Which probability statement to evaluate.
  prob_choice <- radioButtons(
    "prob_type", "Probability to calculate:",
    choices = list(
      "P(X ≤ x)" = "less",
      "P(X ≥ x)" = "greater",
      "P(x ≤ X ≤ y)" = "between"
    ),
    selected = "less"
  )

  # One panel of slider(s) per probability type; only the panel matching the
  # selected radio button is shown.
  less_panel <- conditionalPanel(
    condition = "input.prob_type == 'less'",
    x_slider("x_less", "x value:", 1)
  )
  greater_panel <- conditionalPanel(
    condition = "input.prob_type == 'greater'",
    x_slider("x_greater", "x value:", 1)
  )
  between_panel <- conditionalPanel(
    condition = "input.prob_type == 'between'",
    x_slider("x_lower", "Lower bound (x):", 0.5),
    x_slider("x_upper", "Upper bound (y):", 2)
  )

  # Left column - inputs.
  inputs_card <- card(
    card_header("Parameters"),
    card_body(
      df_inputs[[1]],
      df_inputs[[2]],
      hr(),
      prob_choice,
      less_panel,
      greater_panel,
      between_panel
    )
  )

  # Right column - plot.
  plot_card <- card(
    card_header("F-distribution plot"),
    card_body(
      uiOutput("plot_title"),
      plotOutput("distPlot", height = "300px")
    )
  )

  # Bottom row - results.
  results_card <- card(
    card_header("Results"),
    card_body(
      textOutput("explanation")
    )
  )

  page_fluid(
    title = "F-distribution calculator",
    layout_columns(
      col_widths = c(4, 8),
      inputs_card,
      plot_card
    ),
    results_card
  )
})
# Server logic for the F-distribution calculator.
#
# Arguments (supplied by Shiny):
#   input   - reads df1, df2, prob_type, x_less, x_greater, x_lower, x_upper
#   output  - fills plot_title, explanation and distPlot
#   session - used to update the slider ranges and values
server <- function(input, output, session) {
  # Smallest x that is plotted or shaded. The F density is unbounded at x = 0
  # when d1 < 2, so both the curve and the shaded region start just above 0.
  x_min <- 0.01

  # TRUE only for a single finite, strictly positive number. An emptied numeric
  # field arrives as NA, and typed values are not constrained by `min`.
  is_valid_df <- function(v) {
    is.numeric(v) && length(v) == 1L && is.finite(v) && v > 0
  }

  # Validated degrees of freedom. Everything downstream goes through this
  # reactive, so invalid input silently pauses the outputs instead of raising
  # errors inside qf()/seq()/sprintf().
  dist_params <- reactive({
    req(is_valid_df(input$df1), is_valid_df(input$df2))
    list(df1 = input$df1, df2 = input$df2)
  })

  # Upper end of the x-axis and of the sliders: the 0.999 quantile of the
  # distribution, capped at 10. Computed once and shared by the slider
  # observer and the plot.
  axis_max <- reactive({
    pars <- dist_params()
    min(qf(0.999, pars$df1, pars$df2), 10)
  })

  # When the degrees of freedom change, adjust the range of all sliders.
  observe({
    upper <- axis_max()
    for (id in c("x_less", "x_greater", "x_lower", "x_upper")) {
      updateSliderInput(session, id, max = upper)
    }
  })

  # Ensure that x_upper is always greater than or equal to x_lower.
  observe({
    req(input$x_lower, input$x_upper)
    if (input$x_upper < input$x_lower) {
      updateSliderInput(session, "x_upper", value = input$x_lower)
    }
  })

  # Display the plot title with the distribution parameters.
  # %g (not %d) is used because sprintf("%d", 2.5) is an error and the
  # F-distribution is well defined for non-integer degrees of freedom.
  output$plot_title <- renderUI({
    pars <- dist_params()
    heading <- sprintf("F-distribution(d₁ = %g, d₂ = %g)", pars$df1, pars$df2)
    tags$h4(heading, style = "text-align: center; margin-bottom: 15px;")
  })

  # Calculate the probability selected by the user.
  # Returns a list with:
  #   prob         - the probability
  #   explanation  - formatted text for the results card
  #   type         - "less", "greater" or "between"
  #   lower, upper - the interval to shade (upper is Inf for "greater")
  probability <- reactive({
    pars <- dist_params()
    cdf <- function(q, lower.tail = TRUE) {
      pf(q, df1 = pars$df1, df2 = pars$df2, lower.tail = lower.tail)
    }

    switch(
      req(input$prob_type),
      less = {
        x <- input$x_less
        prob <- cdf(x)
        list(
          prob = prob,
          explanation = sprintf("P(X ≤ %.1f) = %.6f or %.4f%%",
                                x, prob, prob * 100),
          type = "less", lower = 0, upper = x
        )
      },
      greater = {
        x <- input$x_greater
        # Ask for the upper tail directly; 1 - pf() loses precision when the
        # tail probability is tiny.
        prob <- cdf(x, lower.tail = FALSE)
        list(
          prob = prob,
          explanation = sprintf("P(X ≥ %.1f) = %.6f or %.4f%%",
                                x, prob, prob * 100),
          type = "greater", lower = x, upper = Inf
        )
      },
      between = {
        lower <- input$x_lower
        # The observer above corrects upper < lower on the client, but this
        # reactive can run before that round trip completes; clamp so a
        # negative "probability" is never shown.
        upper <- max(input$x_upper, lower)
        # For a continuous distribution P(X = a) = 0, which is what the
        # difference gives when lower == upper.
        prob <- cdf(upper) - cdf(lower)
        list(
          prob = prob,
          explanation = sprintf("P(%.1f ≤ X ≤ %.1f) = %.6f or %.4f%%",
                                lower, upper, prob, prob * 100),
          type = "between", lower = lower, upper = upper
        )
      }
    )
  })

  # Display an explanation of the calculation.
  output$explanation <- renderText({
    probability()$explanation
  })

  # Generate the F-distribution plot with the requested region shaded.
  output$distPlot <- renderPlot({
    pars <- dist_params()
    max_x <- axis_max()
    density_at <- function(x) df(x, df1 = pars$df1, df2 = pars$df2)

    # Density curve on [x_min, max_x].
    curve_x <- seq(x_min, max_x, length.out = 500)
    curve_df <- data.frame(x = curve_x, density = density_at(curve_x))

    p <- ggplot(curve_df, aes(x = x, y = density)) +
      # `linewidth` replaces the `size` argument deprecated for lines in
      # ggplot2 3.4.0.
      geom_line(linewidth = 1, color = "darkgray") +
      labs(x = "X", y = "probability density function") +
      theme_minimal() +
      theme(panel.grid.minor = element_blank()) +
      xlim(0, max_x)

    # Shade the selected region, clamped to the plotted range so the shading
    # uses the same domain as the curve and never runs backwards (e.g. x = 0).
    res <- probability()
    if (!is.null(res)) {
      shade_from <- max(res$lower, x_min)
      shade_to <- min(res$upper, max_x)
      if (shade_to > shade_from) {
        shade_x <- seq(shade_from, shade_to, length.out = 200)
        shade_df <- data.frame(x = shade_x, density = density_at(shade_x))
        p <- p + geom_area(data = shade_df, aes(x = x, y = density),
                           fill = "#3F6BB6", alpha = 0.6)
      }
    }

    p
  })
}
# Build and launch the Shiny app from the `ui` object and `server` function.
shinyApp(ui = ui, server = server)
Where to use: The \(F\)-distribution describes the ratio \((X/d_1)/(Y/d_2)\) of two independent \(\chi^2\)-distributed random variables \(X\sim \chi^2(d_1)\) and \(Y\sim \chi^2(d_2)\), each divided by its degrees of freedom. It is commonly used as a reference distribution in hypothesis testing to compare two variances or more than two means, such as in Analysis of Variance (ANOVA) tests.
Notation: \(X \sim F(d_{1},d_{2})\)
Parameters: Two positive integers \(d_1\) and \(d_2\), where

- \(d_{1}\) is the degrees of freedom of the random variable \(X\sim \chi^2(d_{1})\) (the numerator degrees of freedom);
- \(d_{2}\) is the degrees of freedom of the random variable \(Y\sim \chi^2(d_{2})\) (the denominator degrees of freedom).
Quantity | Value | Notes |
---|---|---|
Mean | \(\mathbb{E}(X) = \dfrac{d_{2}}{d_{2}-2}\) | \(d_2>2\) |
Variance | \(\mathbb{V}(X) = \dfrac{2d_{2}^{2}(d_{1}+d_{2}-2)}{d_{1}(d_{2}-2)^2(d_{2}-4)}\) | \(d_2>4\) |
PDF | \(\mathbb{P}(X=x)=\dfrac{\sqrt{\frac{(d_{1}x)^{d_{1}}d_{2}^{d_{2}}}{(d_{1}x+d_{2})^{d_{1}+d_{2}}}}}{x\textrm{B}\left(\frac{d_{1}}{2},\frac{d_{2}}{2}\right)}\) | \(\textrm{B}(x,y)\) is the beta function |
CDF | \(\mathbb{P}(X \leq x)=I_{\frac{d_{1}x}{d_{1}x+d_{2}}}(\frac{d_{1}}{2},\frac{d_{2}}{2})\) | \(I_{x}(a,b)\) is the regularized incomplete beta function |
Example: You have three independent groups of data containing Cantor’s Confectionery chocolate bar lengths, and the total sample size is 90. From this, you would like to conduct an ANOVA test investigating if there is a statistically significant difference between the means of each group. You can find the degrees of freedom using the following methods:
\(\textsf{numerator degrees of freedom = number of groups} - 1 = 3 - 1 = 2\)
\(\textsf{denominator degrees of freedom = sample size - number of groups} = 90 - 3 = 87\)
The \(F\) distribution, which will be used as a reference distribution for the ANOVA test, can be expressed as \(X \sim F(2,87)\), meaning the numerator degrees of freedom is \(2\) and the denominator degrees of freedom is \(87\).
Further reading
Version history
- v1.0: initial version created 04/25 by tdhc and Michelle Arnetta as part of a University of St Andrews VIP project.
- v1.1: moved to factsheet form and populated with material from Overview: Probability distributions by tdhc.