Lab 13: Soil MAG Community Analysis from NEON Sites

Data Preparation

This dataset contains MAGs collected from NEON soil metagenomes.
Taxonomy and sample site IDs are used to evaluate microbial community structure.

library(tidyverse)
library(vegan)

neon <- read_csv("~/Bio 478 Lab/NEON_soilMAGs_soilChem.csv") |>
  filter(!is.na(phylum))

Distribution of MAGs Across Sites

Counting MAGs per site illustrates sampling depth differences and general variation in recovered genome counts.

neon |>
  count(site_ID, name = "n_MAGs") |>
  arrange(desc(n_MAGs)) |>
  slice_head(n = 15) |>
  ggplot(aes(x = reorder(site_ID, n_MAGs), y = n_MAGs)) +
  geom_col() +
  coord_flip() +
  xlab("Site") +
  ylab("MAG Count") +
  ggtitle("Top Sites by Number of MAGs")


Dominant Phyla Across All Sites

Phylum frequency highlights which broad microbial lineages are most detected in the MAG dataset.

neon |>
  count(phylum, name = "n_MAGs") |>
  arrange(desc(n_MAGs)) |>
  slice_head(n = 15) |>
  ggplot(aes(x = reorder(phylum, n_MAGs), y = n_MAGs)) +
  geom_col() +
  coord_flip() +
  xlab("Phylum") +
  ylab("MAG Count") +
  ggtitle("Most Represented Phyla")


Dominant Classes Across All Sites

Class-level patterns refine broad phylum profiles and show more detailed taxonomy differences.

if ("class" %in% names(neon)) {
  neon |>
    filter(!is.na(class)) |>
    count(class, name = "n_MAGs") |>
    arrange(desc(n_MAGs)) |>
    slice_head(n = 15) |>
    ggplot(aes(x = reorder(class, n_MAGs), y = n_MAGs)) +
    geom_col() +
    coord_flip() +
    xlab("Class") +
    ylab("MAG Count") +
    ggtitle("Most Represented Classes Across All Sites")
}


Phylum Composition by Site

Relative phylum proportions show taxonomic differences among sampling locations.
MAG count per phylum serves as a proxy for presence because read abundance data are not included in this dataset.

phylum_rel2 |>
  ggplot(aes(x = site_ID, y = rel_abundance, fill = phylum_top)) +
  geom_col() +
  xlab("Site") +
  ylab("Relative Abundance") +
  ggtitle("Relative Phylum Proportions at Each Site") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))


Heatmap of Phylum Counts by Site

This heatmap visualizes absolute phylum-level presence by site.

top20 <- neon |>
  count(phylum) |>
  arrange(desc(n)) |>
  slice_head(n = 20) |>
  pull(phylum)

heat_df <- neon |>
  filter(phylum %in% top20) |>
  count(site_ID, phylum, name = "n_MAGs")

ggplot(heat_df, aes(x = phylum,
                    y = site_ID,
                    fill = n_MAGs)) +
  geom_tile() +
  scale_fill_viridis_c(
    option = "C",
    direction = 1,
    na.value = "white"
  ) +
  xlab("Phylum") +
  ylab("Site ID") +
  ggtitle("Heatmap of MAG Counts for Top 20 Phyla Across Sites") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(
      angle = 45,
      hjust = 1,
      size = 8
    ),
    axis.text.y = element_text(size = 8),
    plot.title = element_text(size = 12, face = "bold")
  )


Alpha Diversity of MAG Communities

Alpha diversity describes within-site diversity.
Richness counts detected phyla, while the Shannon index increases as both richness and evenness rise.

alpha_long |>
  ggplot(aes(x = reorder(site_ID, Value), y = Value)) +
  geom_col() +
  facet_wrap(~ Metric, scales = "free_y") +
  xlab("Site") +
  ylab("Diversity Value") +
  ggtitle("Alpha Diversity (Shannon Index and Richness)") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1))


Shannon Diversity vs Soil pH

Shannon diversity is compared to soil pH to examine possible environmental influence.

ggplot(alpha_env, aes(x = soil_pH, y = Shannon)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Soil pH") +
  ylab("Shannon Diversity") +
  ggtitle("Relationship Between Shannon Diversity and Soil pH")


Richness vs Soil Moisture

Richness is plotted against soil moisture to explore another environmental gradient.

ggplot(alpha_env, aes(x = moisture, y = Richness)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("Soil Moisture") +
  ylab("Richness") +
  ggtitle("Richness Across Soil Moisture Gradient")


Bray–Curtis Distance Between Sites

Bray–Curtis measures compositional dissimilarity (0 = identical, 1 = no shared taxa).
Site clustering and heatmaps reveal similarities in microbial structure.

plot(
  hclust(bray_dist, method = "average"),
  labels = comm_matrix$site_ID,
  main = "Hierarchical Clustering of Sites (Bray–Curtis Distance)",
  xlab = "", sub = ""
)

n_show <- min(30, nrow(comm_only))
bray_mat <- as.matrix(bray_dist)[1:n_show, 1:n_show]
rownames(bray_mat) <- comm_matrix$site_ID[1:n_show]
colnames(bray_mat) <- comm_matrix$site_ID[1:n_show]

heatmap(
  bray_mat,
  Rowv = NA, Colv = NA,
  scale = "none",
  main = "Bray–Curtis Distance Heatmap (Subset)"
)


PCA Ordination Based on Phylum Counts

PCA provides a linear ordination alternative to NMDS.

ggplot(pca_df, aes(x = PC1, y = PC2)) +
  geom_point(size = 3, alpha = 0.7) +
  xlab(paste0("PC1 (", pc1_var, "%)")) +
  ylab(paste0("PC2 (", pc2_var, "%)")) +
  ggtitle("PCA of Sites Using Phylum Counts") +
  theme_minimal()


NMDS Ordination of Sites

NMDS projects multidimensional distance data into two axes.
Samples positioned closely contain more similar MAG composition.

ggplot(ord_df, aes(x = MDS1, y = MDS2)) +
  geom_point(size = 3, alpha = 0.7) +
  xlab("NMDS1") +
  ylab("NMDS2") +
  ggtitle("NMDS Ordination of Sites (Bray–Curtis)") +
  theme_minimal()


NMDS Ordination Colored by Soil pH

Overlaying soil pH reveals whether environmental gradients align with compositional differences in ordination space.

ggplot(ord_env, aes(x = MDS1, y = MDS2, colour = soilInWaterpH)) +
  geom_point(size = 3) +
  scale_colour_viridis_c(option = "C", na.value = "grey70") +
  xlab("NMDS1") +
  ylab("NMDS2") +
  ggtitle("NMDS Ordination with Soil pH Gradient") +
  theme_minimal()


NMDS with Convex Hull By Site

Convex hull polygons outline clusters of points for each site in ordination space.

hull_df <- ord_df |>
  group_by(site_ID) |>
  slice(chull(MDS1, MDS2))
ggplot(ord_df, aes(MDS1, MDS2, colour = site_ID)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_polygon(
    data = hull_df,
    aes(fill = site_ID),
    alpha = 0.12,
    colour = NA
  ) +
  guides(fill = FALSE) +
  xlab("NMDS1") +
  ylab("NMDS2") +
  ggtitle("NMDS with Site Convex Hull Groups") +
  theme_minimal()


Summary of Observations

Across NEON sites, MAG composition differs at the phylum level.
Shannon diversity and richness values show uneven community structure among locations.
Bray–Curtis clustering and distance heatmaps indicate groups of sites with similar microbial profiles.
PCA and NMDS ordinations both suggest that soil pH and moisture may relate to differences in community composition.
Because read-based abundance data are unavailable, interpretations are based on MAG presence rather than quantitative abundance.