Skip to contents

Introduction

This vignette demonstrates how to use thoth for a typical RNA-seq analysis workflow using edgeR and limma. We’ll walk through:

  1. Setting up a reproducible project structure
  2. Tracking raw count data with DVC
  3. Performing quality control with documented decisions
  4. Running differential expression analysis
  5. Creating publication-ready visualizations
  6. Building a reproducible pipeline

Project Setup

First, let’s create a new project with all reproducibility features enabled:

# Load required packages
library(thoth)
library(tidyverse)
library(edgeR)
library(limma)
library(statmod)  # Required for edgeR
library(ComplexHeatmap)

# Create new project with reproducibility features
create_analytics_project(
  "rnaseq_analysis",
  use_dvc = TRUE,      # Enable data version control
  use_docker = TRUE,   # Enable containerization
  git_init = TRUE      # Initialize Git repository
)

# Change to project directory
setwd("rnaseq_analysis")

Data Management

Generate Example Dataset

For this example, we’ll create a simulated RNA-seq dataset:

# Set random seed for reproducibility
set.seed(42)

# Simulation parameters
n_genes <- 10000
n_samples <- 8

# Create counts matrix with negative binomial distribution
counts <- matrix(
  rnbinom(n_genes * n_samples, mu = 100, size = 1),
  nrow = n_genes,
  ncol = n_samples
)

# Add informative row and column names
rownames(counts) <- paste0("gene_", 1:n_genes)
colnames(counts) <- paste0("sample_", 1:n_samples)

# Create sample metadata with treatment and batch information
sample_info <- data.frame(
  sample = colnames(counts),
  group = rep(c("control", "treatment"), each = 4),
  batch = rep(c("A", "B"), 4)
)

Track Raw Data with DVC

Save and version our raw data:

# Save and track count matrix
counts |>
  as.data.frame() |>
  rownames_to_column("gene_id") |>
  write_csv_dvc(
    "data/raw/counts.csv",
    message = "Add raw RNA-seq count data",
    stage_name = "save_raw_data"
  )

# Save and track sample metadata
sample_info |>
  write_csv_dvc(
    "data/raw/sample_info.csv",
    message = "Add sample metadata",
    stage_name = "save_metadata"
  )

Analysis Setup

Initialize Decision Tree

Create a structured way to track our analytical decisions:

# Initialize decision tracking
decision_file <- initialize_decision_tree(
  analysis_id = "rnaseq_2024",
  analyst = "Data Scientist",
  description = "Differential expression analysis comparing treatment vs control"
)

Create DGEList Object

Set up our edgeR analysis object:

# Read data using tracked files
counts_df <- read_csv("data/raw/counts.csv")
sample_info <- read_csv("data/raw/sample_info.csv")

# Create DGEList object
dge <- DGEList(
  counts = counts_df |> column_to_rownames("gene_id"),
  group = sample_info$group
)

# Add batch information for later use
dge$samples$batch <- sample_info$batch

# Record initial data structure
record_decision(
  decision_file,
  check = "Data import",
  observation = sprintf(
    "Dataset contains %d genes across %d samples",
    nrow(dge), ncol(dge)
  ),
  decision = "Proceed with analysis",
  reasoning = "Data structure matches experimental design",
  evidence = NULL
)

Quality Control

1. Expression Filtering

Remove lowly expressed genes:

# Calculate library sizes and CPM
lib_sizes <- dge$samples$lib.size
cpms <- cpm(dge)

# Create expression density plot
png("plots/gene_expression_density.png", width = 800, height = 600)
plot(density(log2(cpms[cpms > 0])), 
     main = "Gene Expression Distribution",
     xlab = "log2 CPM")
dev.off()

# Filter low expression genes
keep <- filterByExpr(dge, group = dge$samples$group)
dge_filtered <- dge[keep, ]

# Record filtering decision
record_decision(
  decision_file,
  check = "Gene filtering",
  observation = sprintf(
    "Removed %d genes (%d%%) with consistently low counts",
    sum(!keep), round(100 * sum(!keep) / length(keep))
  ),
  decision = "Filter using filterByExpr()",
  reasoning = "Remove noise from lowly expressed genes",
  evidence = "plots/gene_expression_density.png"
)

# Save filtered data with DVC
dge_filtered$counts |>
  as.data.frame() |>
  rownames_to_column("gene_id") |>
  write_csv_dvc(
    "data/processed/filtered_counts.csv",
    message = "Add filtered count data",
    stage_name = "filter_genes",
    deps = c(
      "data/raw/counts.csv",
      "data/raw/sample_info.csv"
    ),
    params = list(
      min_cpm = 1,
      min_samples = 4
    )
  )

2. Normalization

Apply TMM normalization to account for composition bias:

# Calculate normalization factors
dge_filtered <- calcNormFactors(dge_filtered)

# Record normalization decision
record_decision(
  decision_file,
  check = "Normalization",
  observation = sprintf(
    "Library sizes range from %s to %s million reads",
    format(min(lib_sizes) / 1e6, digits = 2),
    format(max(lib_sizes) / 1e6, digits = 2)
  ),
  decision = "Apply TMM normalization",
  reasoning = "Account for composition bias between samples",
  evidence = NULL
)

3. Sample Quality Assessment

Visualize sample relationships:

# Create MDS plot
png("plots/mds_plot.png", width = 800, height = 600)
limma::plotMDS(dge_filtered,
               col = as.numeric(factor(dge_filtered$samples$group)),
               pch = as.numeric(factor(dge_filtered$samples$batch)))
legend("topright",
       legend = c(levels(factor(dge_filtered$samples$group)),
                 levels(factor(dge_filtered$samples$batch))),
       col = c(1:2, rep("black", 2)),
       pch = c(1, 1, 1:2))
dev.off()

# Record clustering observation
record_decision(
  decision_file,
  check = "Sample clustering",
  observation = "Samples cluster primarily by treatment with visible batch effects",
  decision = "Include batch in design matrix",
  reasoning = "Account for technical variation while testing treatment effect",
  evidence = "plots/mds_plot.png"
)

Differential Expression Analysis

1. Model Fitting

Fit linear model accounting for batch effects:

# Create design matrix with batch effect
design <- model.matrix(
  ~batch + group,
  data = dge_filtered$samples
)

# Apply voom transformation and fit model
v <- voom(dge_filtered, design, plot = TRUE)
png("plots/voom_plot.png", width = 800, height = 600)
v <- voom(dge_filtered, design, plot = TRUE)
dev.off()

# Fit linear model
fit <- lmFit(v, design)
fit <- eBayes(fit)

# Get results table
results <- topTable(
  fit,
  coef = "grouptreatment",
  number = Inf
) |>
  rownames_to_column("gene_id")

# Save results with DVC
results |>
  write_csv_dvc(
    "data/processed/de_results.csv",
    message = "Add differential expression results",
    stage_name = "de_analysis",
    deps = "data/processed/filtered_counts.csv",
    params = list(
      adj_p_threshold = 0.05,
      lfc_threshold = 1
    ),
    metrics = TRUE  # Track as DVC metrics
  )

# Record analysis decisions
record_decision(
  decision_file,
  check = "Differential expression",
  observation = sprintf(
    "Found %d DE genes (FDR < 0.05, |logFC| > 1)",
    sum(results$adj.P.Val < 0.05 & abs(results$logFC) > 1)
  ),
  decision = "Use voom-limma pipeline with batch correction",
  reasoning = "Account for mean-variance relationship and batch effects",
  evidence = c("plots/voom_plot.png", "data/processed/de_results.csv")
)

2. Visualization

Create publication-ready figures:

# Create volcano plot
results |>
  ggplot(aes(x = logFC, y = -log10(adj.P.Val))) +
  geom_point(
    aes(color = adj.P.Val < 0.05 & abs(logFC) > 1),
    alpha = 0.6
  ) +
  scale_color_manual(
    values = c("grey", "red"),
    labels = c("Non-significant", "DE genes")
  ) +
  theme_minimal() +
  labs(
    title = "Differential Expression Analysis",
    x = "log2 Fold Change",
    y = "-log10 Adjusted P-value",
    color = "Significance"
  ) |>
  ggsave(
    "plots/volcano_plot.png",
    width = 10,
    height = 8
  )

# Create heatmap of top DE genes
top_genes <- results |>
  filter(adj.P.Val < 0.05, abs(logFC) > 1) |>
  slice_head(n = 50) |>
  pull(gene_id)

# Get normalized expression for top genes
expr_mat <- v$E[top_genes, ]

# Save heatmap
png("plots/heatmap.png", width = 800, height = 1000)
Heatmap(
  expr_mat,
  name = "Expression",
  column_split = dge_filtered$samples$group,
  show_row_names = FALSE,
  column_title = "Top 50 DE Genes"
)
dev.off()

Export Analysis Documentation

Generate comprehensive documentation:

# Export decision tree
export_decision_tree(
  decision_file,
  format = "html",
  output_path = "reports/analysis_decisions.html"
)

# Generate methods section
methods_text <- generate_methods_section(decision_file)
writeLines(methods_text, "reports/methods_section.md")

Complete DVC Pipeline

Our analysis is now fully tracked and reproducible:

# Check pipeline status
dvc_status()

# Reproduce entire analysis
dvc_repro()

# Push to remote storage
dvc_push()

Conclusion

This example demonstrates how to: - Track RNA-seq data with DVC - Document QC and analysis decisions - Create reproducible bioinformatics pipelines - Generate publication-ready figures - Share results and methods

The entire analysis is version controlled and reproducible, with: - Raw data tracked by DVC - Code in Git - Dependencies managed by renv - Analysis decisions documented - Docker environment for reproducibility

Next Steps