Introduction
This vignette demonstrates how to use thoth
for a
typical RNA-seq analysis workflow using edgeR and limma. We’ll walk
through:
- Setting up a reproducible project structure
- Tracking raw count data with DVC
- Performing quality control with documented decisions
- Running differential expression analysis
- Creating publication-ready visualizations
- Building a reproducible pipeline
Project Setup
First, let’s create a new project with all reproducibility features enabled:
# Load required packages
library(thoth)
library(tidyverse)
library(edgeR)
library(limma)
library(statmod) # Required for edgeR
library(ComplexHeatmap)
# Create new project with reproducibility features
create_analytics_project(
"rnaseq_analysis",
use_dvc = TRUE, # Enable data version control
use_docker = TRUE, # Enable containerization
git_init = TRUE # Initialize Git repository
)
# Change to project directory
setwd("rnaseq_analysis")
Data Management
Generate Example Dataset
For this example, we’ll create a simulated RNA-seq dataset:
# Set random seed for reproducibility
set.seed(42)
# Simulation parameters
n_genes <- 10000
n_samples <- 8
# Create counts matrix with negative binomial distribution
counts <- matrix(
rnbinom(n_genes * n_samples, mu = 100, size = 1),
nrow = n_genes,
ncol = n_samples
)
# Add informative row and column names
rownames(counts) <- paste0("gene_", 1:n_genes)
colnames(counts) <- paste0("sample_", 1:n_samples)
# Create sample metadata with treatment and batch information
sample_info <- data.frame(
sample = colnames(counts),
group = rep(c("control", "treatment"), each = 4),
batch = rep(c("A", "B"), 4)
)
Track Raw Data with DVC
Save and version our raw data:
# Save and track count matrix
counts |>
as.data.frame() |>
rownames_to_column("gene_id") |>
write_csv_dvc(
"data/raw/counts.csv",
message = "Add raw RNA-seq count data",
stage_name = "save_raw_data"
)
# Save and track sample metadata
sample_info |>
write_csv_dvc(
"data/raw/sample_info.csv",
message = "Add sample metadata",
stage_name = "save_metadata"
)
Analysis Setup
Initialize Decision Tree
Create a structured way to track our analytical decisions:
# Initialize decision tracking
decision_file <- initialize_decision_tree(
analysis_id = "rnaseq_2024",
analyst = "Data Scientist",
description = "Differential expression analysis comparing treatment vs control"
)
Create DGEList Object
Set up our edgeR analysis object:
# Read data using tracked files
counts_df <- read_csv("data/raw/counts.csv")
sample_info <- read_csv("data/raw/sample_info.csv")
# Create DGEList object
dge <- DGEList(
counts = counts_df |> column_to_rownames("gene_id"),
group = sample_info$group
)
# Add batch information for later use
dge$samples$batch <- sample_info$batch
# Record initial data structure
record_decision(
decision_file,
check = "Data import",
observation = sprintf(
"Dataset contains %d genes across %d samples",
nrow(dge), ncol(dge)
),
decision = "Proceed with analysis",
reasoning = "Data structure matches experimental design",
evidence = NULL
)
Quality Control
1. Expression Filtering
Remove lowly expressed genes:
# Calculate library sizes and CPM
lib_sizes <- dge$samples$lib.size
cpms <- cpm(dge)
# Create expression density plot
png("plots/gene_expression_density.png", width = 800, height = 600)
plot(density(log2(cpms[cpms > 0])),
main = "Gene Expression Distribution",
xlab = "log2 CPM")
dev.off()
# Filter low expression genes
keep <- filterByExpr(dge, group = dge$samples$group)
dge_filtered <- dge[keep, ]
# Record filtering decision
record_decision(
decision_file,
check = "Gene filtering",
observation = sprintf(
"Removed %d genes (%d%%) with consistently low counts",
sum(!keep), round(100 * sum(!keep) / length(keep))
),
decision = "Filter using filterByExpr()",
reasoning = "Remove noise from lowly expressed genes",
evidence = "plots/gene_expression_density.png"
)
# Save filtered data with DVC
dge_filtered$counts |>
as.data.frame() |>
rownames_to_column("gene_id") |>
write_csv_dvc(
"data/processed/filtered_counts.csv",
message = "Add filtered count data",
stage_name = "filter_genes",
deps = c(
"data/raw/counts.csv",
"data/raw/sample_info.csv"
),
params = list(
min_cpm = 1,
min_samples = 4
)
)
2. Normalization
Apply TMM normalization to account for composition bias:
# Calculate normalization factors
dge_filtered <- calcNormFactors(dge_filtered)
# Record normalization decision
record_decision(
decision_file,
check = "Normalization",
observation = sprintf(
"Library sizes range from %s to %s million reads",
format(min(lib_sizes) / 1e6, digits = 2),
format(max(lib_sizes) / 1e6, digits = 2)
),
decision = "Apply TMM normalization",
reasoning = "Account for composition bias between samples",
evidence = NULL
)
3. Sample Quality Assessment
Visualize sample relationships:
# Create MDS plot
png("plots/mds_plot.png", width = 800, height = 600)
limma::plotMDS(dge_filtered,
col = as.numeric(factor(dge_filtered$samples$group)),
pch = as.numeric(factor(dge_filtered$samples$batch)))
legend("topright",
legend = c(levels(factor(dge_filtered$samples$group)),
levels(factor(dge_filtered$samples$batch))),
col = c(1:2, rep("black", 2)),
pch = c(1, 1, 1:2))
dev.off()
# Record clustering observation
record_decision(
decision_file,
check = "Sample clustering",
observation = "Samples cluster primarily by treatment with visible batch effects",
decision = "Include batch in design matrix",
reasoning = "Account for technical variation while testing treatment effect",
evidence = "plots/mds_plot.png"
)
Differential Expression Analysis
1. Model Fitting
Fit linear model accounting for batch effects:
# Create design matrix with batch effect
design <- model.matrix(
~batch + group,
data = dge_filtered$samples
)
# Apply voom transformation and fit model
v <- voom(dge_filtered, design, plot = TRUE)
png("plots/voom_plot.png", width = 800, height = 600)
v <- voom(dge_filtered, design, plot = TRUE)
dev.off()
# Fit linear model
fit <- lmFit(v, design)
fit <- eBayes(fit)
# Get results table
results <- topTable(
fit,
coef = "grouptreatment",
number = Inf
) |>
rownames_to_column("gene_id")
# Save results with DVC
results |>
write_csv_dvc(
"data/processed/de_results.csv",
message = "Add differential expression results",
stage_name = "de_analysis",
deps = "data/processed/filtered_counts.csv",
params = list(
adj_p_threshold = 0.05,
lfc_threshold = 1
),
metrics = TRUE # Track as DVC metrics
)
# Record analysis decisions
record_decision(
decision_file,
check = "Differential expression",
observation = sprintf(
"Found %d DE genes (FDR < 0.05, |logFC| > 1)",
sum(results$adj.P.Val < 0.05 & abs(results$logFC) > 1)
),
decision = "Use voom-limma pipeline with batch correction",
reasoning = "Account for mean-variance relationship and batch effects",
evidence = c("plots/voom_plot.png", "data/processed/de_results.csv")
)
2. Visualization
Create publication-ready figures:
# Create volcano plot
results |>
ggplot(aes(x = logFC, y = -log10(adj.P.Val))) +
geom_point(
aes(color = adj.P.Val < 0.05 & abs(logFC) > 1),
alpha = 0.6
) +
scale_color_manual(
values = c("grey", "red"),
labels = c("Non-significant", "DE genes")
) +
theme_minimal() +
labs(
title = "Differential Expression Analysis",
x = "log2 Fold Change",
y = "-log10 Adjusted P-value",
color = "Significance"
) |>
ggsave(
"plots/volcano_plot.png",
width = 10,
height = 8
)
# Create heatmap of top DE genes
top_genes <- results |>
filter(adj.P.Val < 0.05, abs(logFC) > 1) |>
slice_head(n = 50) |>
pull(gene_id)
# Get normalized expression for top genes
expr_mat <- v$E[top_genes, ]
# Save heatmap
png("plots/heatmap.png", width = 800, height = 1000)
Heatmap(
expr_mat,
name = "Expression",
column_split = dge_filtered$samples$group,
show_row_names = FALSE,
column_title = "Top 50 DE Genes"
)
dev.off()
Export Analysis Documentation
Generate comprehensive documentation:
# Export decision tree
export_decision_tree(
decision_file,
format = "html",
output_path = "reports/analysis_decisions.html"
)
# Generate methods section
methods_text <- generate_methods_section(decision_file)
writeLines(methods_text, "reports/methods_section.md")
Complete DVC Pipeline
Our analysis is now fully tracked and reproducible:
# Check pipeline status
dvc_status()
# Reproduce entire analysis
dvc_repro()
# Push to remote storage
dvc_push()
Conclusion
This example demonstrates how to: - Track RNA-seq data with DVC - Document QC and analysis decisions - Create reproducible bioinformatics pipelines - Generate publication-ready figures - Share results and methods
The entire analysis is version controlled and reproducible, with: - Raw data tracked by DVC - Code in Git - Dependencies managed by renv - Analysis decisions documented - Docker environment for reproducibility
Next Steps
- Adapt this workflow to your own RNA-seq data
- Explore more advanced features in
vignette("dvc-tracking")
- Learn about custom templates in
vignette("custom-templates")
- Check out Git integration in
vignette("git-integration")