Factsheet: \(F\)-distribution

Statistics

Author

Michelle Arnetta and Tom Coleman

Summary

A factsheet for the \(F\)-distribution.

#| '!! shinylive warning !!': |
#|   shinylive does not work in self-contained HTML documents.
#|   Please set `embed-resources: false` in your metadata.
#| standalone: true
#| viewerHeight: 760

library(shiny)
library(bslib)
library(ggplot2)

# Build one of the x-value sliders. All four share the range 0-10 and a step
# of 0.1; only the input id, the label and the starting value differ.
x_slider <- function(id, label, start) {
  sliderInput(id, label, min = 0, max = 10, value = start, step = 0.1)
}

# Left-hand card: distribution parameters, the probability type, and the
# slider(s) that belong to the currently selected probability type.
parameter_card <- card(
  card_header("Parameters"),
  card_body(
    numericInput(
      "df1", "Numerator degrees of freedom (d₁):",
      value = 5, min = 1, step = 1
    ),
    numericInput(
      "df2", "Denominator degrees of freedom (d₂):",
      value = 10, min = 1, step = 1
    ),
    hr(),
    radioButtons(
      "prob_type", "Probability to calculate:",
      choices = list(
        "P(X ≤ x)" = "less",
        "P(X ≥ x)" = "greater",
        "P(x ≤ X ≤ y)" = "between"
      ),
      selected = "less"
    ),
    # Each panel is shown only while its probability type is selected.
    conditionalPanel(
      condition = "input.prob_type == 'less'",
      x_slider("x_less", "x value:", 1)
    ),
    conditionalPanel(
      condition = "input.prob_type == 'greater'",
      x_slider("x_greater", "x value:", 1)
    ),
    conditionalPanel(
      condition = "input.prob_type == 'between'",
      x_slider("x_lower", "Lower bound (x):", 0.5),
      x_slider("x_upper", "Upper bound (y):", 2)
    )
  )
)

# Right-hand card: heading with the current parameters plus the density plot.
plot_card <- card(
  card_header("F-distribution plot"),
  card_body(
    uiOutput("plot_title"),
    plotOutput("distPlot", height = "300px")
  )
)

# Full-width card underneath: the computed probability as text.
results_card <- card(
  card_header("Results"),
  card_body(
    textOutput("explanation")
  )
)

# Page layout: inputs (4/12) beside the plot (8/12), results below.
ui <- page_fluid(
  title = "F-distribution calculator",
  layout_columns(
    parameter_card,
    plot_card,
    col_widths = c(4, 8)
  ),
  results_card
)

# ---- Pure helpers (no Shiny dependency) ------------------------------------

# Left edge used when drawing the curve and the shaded region. The density at
# x = 0 is infinite when d1 < 2, so drawing starts just to the right of zero.
f_plot_x_min <- 0.01

# TRUE when `x` is a single finite number greater than zero, i.e. a value that
# can be used as a degrees-of-freedom parameter. NULL, NA, empty and
# non-positive values all give FALSE.
is_valid_df <- function(x) {
  is.numeric(x) && length(x) == 1L && is.finite(x) && x > 0
}

# Upper limit shared by the sliders and the plot's x-axis: the 0.999 quantile
# of F(df1, df2), capped at `cap` so heavy-tailed cases stay readable.
f_axis_max <- function(df1, df2, cap = 10) {
  min(qf(0.999, df1, df2), cap)
}

# Heading shown above the plot. format() is used instead of "%d" because
# numericInput() can deliver non-integer values, which "%d" rejects.
f_plot_title <- function(df1, df2) {
  sprintf(
    "F-distribution(d₁ = %s, d₂ = %s)",
    format(df1, scientific = FALSE, trim = TRUE),
    format(df2, scientific = FALSE, trim = TRUE)
  )
}

# Compute the requested probability for X ~ F(df1, df2).
#
# type  : "less" (P(X <= x)), "greater" (P(X >= x)) or "between"
#         (P(lower <= X <= upper)).
# x     : cut-off used by "less" and "greater".
# lower, upper : bounds used by "between".
#
# Returns a list with `prob`, a formatted `explanation`, the `type`, and
# either `x` or `lower`/`upper`. Returns NULL for any other `type`.
f_probability <- function(type, df1, df2, x = NULL, lower = NULL,
                          upper = NULL) {
  if (type == "less") {
    prob <- pf(x, df1 = df1, df2 = df2)
    list(
      prob = prob,
      explanation = sprintf("P(X ≤ %.1f) = %.6f or %.4f%%", x, prob, prob * 100),
      type = "less",
      x = x
    )
  } else if (type == "greater") {
    # Ask for the upper tail directly rather than computing 1 - pf(), which
    # loses precision when the lower-tail probability is close to 1.
    prob <- pf(x, df1 = df1, df2 = df2, lower.tail = FALSE)
    list(
      prob = prob,
      explanation = sprintf("P(X ≥ %.1f) = %.6f or %.4f%%", x, prob, prob * 100),
      type = "greater",
      x = x
    )
  } else if (type == "between") {
    # Equal bounds give exactly 0 (a single point has probability zero).
    # max() keeps the result at 0 if the bounds are momentarily reversed.
    prob <- max(
      0,
      pf(upper, df1 = df1, df2 = df2) - pf(lower, df1 = df1, df2 = df2)
    )
    list(
      prob = prob,
      explanation = sprintf(
        "P(%.1f ≤ X ≤ %.1f) = %.6f or %.4f%%", lower, upper, prob, prob * 100
      ),
      type = "between",
      lower = lower,
      upper = upper
    )
  }
}

# Interval of the x-axis to shade for a result returned by f_probability(),
# clamped to the drawable range [x_min, x_max]. Returns c(lo, hi), or NULL
# when nothing should be shaded (empty interval or unknown type).
f_shade_range <- function(res, x_min, x_max) {
  bounds <- switch(res$type,
    less = c(x_min, res$x),
    greater = c(res$x, x_max),
    between = c(res$lower, res$upper)
  )
  if (is.null(bounds)) {
    return(NULL)
  }
  bounds <- pmin(pmax(bounds, x_min), x_max)
  if (bounds[2] > bounds[1]) bounds else NULL
}

# ---- Server ----------------------------------------------------------------

# Shiny server function: keeps the slider ranges in step with the degrees of
# freedom, computes the selected probability, and renders the title, the
# explanation text and the shaded density plot.
server <- function(input, output, session) {

  # Validated degrees of freedom. Everything downstream reads them from here,
  # so an empty or non-positive input produces one readable message instead
  # of NA/NaN leaking into qf(), pf() and seq().
  dfs <- reactive({
    validate(need(
      is_valid_df(input$df1) && is_valid_df(input$df2),
      "Degrees of freedom must be positive numbers."
    ))
    list(df1 = input$df1, df2 = input$df2)
  })

  # Single source for the upper x limit used by both the sliders and the plot.
  axis_max <- reactive({
    params <- dfs()
    f_axis_max(params$df1, params$df2)
  })

  # When the degrees of freedom change, adjust the range of every slider.
  observe({
    max_x <- axis_max()
    for (id in c("x_less", "x_greater", "x_lower", "x_upper")) {
      updateSliderInput(session, id, max = max_x)
    }
  })

  # Ensure that x_upper is always greater than or equal to x_lower.
  observe({
    if (input$x_upper < input$x_lower) {
      updateSliderInput(session, "x_upper", value = input$x_lower)
    }
  })

  # Display the plot title with the distribution parameters.
  output$plot_title <- renderUI({
    params <- dfs()
    tags$h4(
      f_plot_title(params$df1, params$df2),
      style = "text-align: center; margin-bottom: 15px;"
    )
  })

  # Calculate the probability based on the user's selection.
  probability <- reactive({
    params <- dfs()
    type <- req(input$prob_type)
    f_probability(
      type, params$df1, params$df2,
      x = if (type == "greater") input$x_greater else input$x_less,
      lower = input$x_lower,
      upper = input$x_upper
    )
  })

  # Display an explanation of the calculation.
  output$explanation <- renderText({
    probability()$explanation
  })

  # Generate the F-distribution plot with the selected region shaded.
  output$distPlot <- renderPlot({
    params <- dfs()
    res <- probability()
    max_x <- axis_max()

    density_at <- function(x) df(x, df1 = params$df1, df2 = params$df2)

    curve_x <- seq(f_plot_x_min, max_x, length.out = 500)
    plot_df <- data.frame(x = curve_x, density = density_at(curve_x))

    p <- ggplot(plot_df, aes(x = x, y = density)) +
      geom_line(linewidth = 1, color = "darkgray") +
      labs(x = "X", y = "probability density function") +
      theme_minimal() +
      theme(panel.grid.minor = element_blank()) +
      xlim(0, max_x)

    # Shade only the part of the requested interval that lies inside the
    # drawn range; skip the layer entirely when that part is empty.
    shade <- f_shade_range(res, f_plot_x_min, max_x)
    if (!is.null(shade)) {
      fill_x <- seq(shade[1], shade[2], length.out = 200)
      fill_df <- data.frame(x = fill_x, density = density_at(fill_x))
      p <- p + geom_area(
        data = fill_df, aes(x = x, y = density),
        fill = "#3F6BB6", alpha = 0.6
      )
    }

    p
  })
}

# Combine the UI definition and the server function into the app object.
shinyApp(ui, server)

Where to use: The \(F\)-distribution is the distribution of the ratio \((X/d_1)/(Y/d_2)\) of two independent \(\chi^2\) random variables \(X\sim \chi^2(d_1)\) and \(Y\sim \chi^2(d_2)\), each divided by its degrees of freedom. It is commonly used as a reference distribution in hypothesis tests that compare two variances or more than two means, such as Analysis of Variance (ANOVA) tests.

Notation: \(X \sim F(d_{1},d_{2})\)

Parameters: Two positive integers \(d_1\) and \(d_2\), where: \(d_{1}\) is the degrees of freedom of the random variable \(X\sim \chi^2(d_{1})\) in the numerator; and \(d_{2}\) is the degrees of freedom of the random variable \(Y\sim \chi^2(d_{2})\) in the denominator.

Quantity	Value	Notes
Mean	\(\mathbb{E}(X) = \dfrac{d_{2}}{d_{2}-2}\)	\(d_2>2\)
Variance	\(\mathbb{V}(X) = \dfrac{2d_{2}^2(d_{1}+d_{2}-2)}{d_{1}(d_{2}-2)^2(d_{2}-4)}\)	\(d_2>4\)
PDF	\(f(x)=\dfrac{\sqrt{\frac{(d_{1}x)^{d_{1}}d_{2}^{d_{2}}}{(d_{1}x+d_{2})^{d_{1}+d_{2}}}}}{x\textrm{B}\left(\frac{d_{1}}{2},\frac{d_{2}}{2}\right)}\)	for \(x>0\); \(\textrm{B}(x,y)\) is the beta function
CDF	\(\mathbb{P}(X \leq x)=I_{\frac{d_{1}x}{d_{1}x+d_{2}}}(\frac{d_{1}}{2},\frac{d_{2}}{2})\)	\(I_{x}(a,b)\) is the regularized incomplete beta function

Example: You have three independent groups of data containing Cantor’s Confectionery chocolate bar lengths, and the total sample size is 90. From this, you would like to conduct an ANOVA test investigating if there is a statistically significant difference between the means of each group. You can find the degrees of freedom using the following methods:

\(\textsf{numerator degrees of freedom = number of groups} - 1 = 3 - 1 = 2\)
\(\textsf{denominator degrees of freedom = sample size - number of groups} = 90 - 3 = 87\)

The \(F\) distribution, which will be used as a reference distribution for the ANOVA test, can be expressed as \(X \sim F(2,87)\), meaning the numerator degrees of freedom is \(2\) and the denominator degrees of freedom is \(87\).

Factsheet: \(F\)-distribution

Further reading

Version history

Mailing List

Feedback