Transform downscaled AquaMaps data into DuckDB for efficient biodiversity analysis
Published
2025-05-27 20:28:46
1 Overview
This notebook ingests downscaled AquaMaps species distribution models (0.05° resolution) into a DuckDB database (msens_sdm) for efficient querying and biodiversity metric calculation. The database schema supports multiple SDM sources and includes spatial functions for raster generation and polygon summarization.
Code
# Setup: load packages, source project helpers, and open the DuckDB database.
librarian::shelf(
  DBI, dbplyr, dplyr, duckdb, fs, glue, here, leaflet, mapgl,
  purrr, sf, terra, tibble, tidyr,
  quiet = TRUE)  # TRUE, not T: T is reassignable and unsafe

source(here("libs/db.R"))
source(here("libs/am_functions.R"))
source(here("libs/sdm_functions.R"))

# set up directories: server vs local development data paths
is_server <- Sys.info()[["sysname"]] == "Linux"
dir_data  <- ifelse(
  is_server,
  "/share/data",
  "~/My Drive/projects/msens/data")

# database setup
db_path <- glue("{dir_data}/derived/msens_sdm.duckdb")
con_sdm <- dbConnect(duckdb(), dbdir = db_path, read_only = FALSE)
# dbDisconnect(con_sdm, shutdown = T)
# duckdb_shutdown(duckdb())

# Is the DuckDB `spatial` extension already loaded on this connection?
# Probes a spatial function: if ST_Point() is callable the extension is
# available; any error means it is not.
check_spatial <- function(con) {
  tryCatch({
    res <- dbGetQuery(con, "SELECT ST_Point(0, 0)")
    TRUE
  }, error = function(e) {
    FALSE
  })
}

# Only install/load the spatial extension if not already available
if (!check_spatial(con_sdm)) {
  res <- dbExecute(con_sdm, "INSTALL spatial; LOAD spatial;")
}
2 Database Schema
The database schema supports multiple species per model and includes biodiversity metrics with flexible regional aggregation.
erDiagram
%% https://mermaid.js.org/syntax/entityRelationshipDiagram.html
%% tables
dataset {
str ds_key PK "dataset key"
str name_short "short name: {source_broad} {taxa_groups} {response_type} in {regions}, {year_pub}"
str name_original "original dataset name"
str description
str citation
str source_broad "e.g. NOAA, AquaMaps, Duke, NCCOS"
str source_detail "e.g. NOAA Southeast Fisheries Science Center"
str regions "e.g. Global, Atlantic, Pacific, Gulf of America"
str response_type "occurrence, range, suitability, biomass, density, diversity"
str taxa_groups "fish, invertebrates, marine mammals, cetaceans, sea turtles, seabirds"
int year_pub "year published"
date date_obs_beg "first observation date"
date date_obs_end "last observation date"
date date_env_beg "first environmental data date"
date date_env_end "last environmental data date"
str link_info "primary information URL"
str link_download "download URL"
str link_metadata "metadata URL"
str links_other "other URLs space-separated"
dbl spatial_res_deg "spatial resolution in decimal degrees"
str temporal_res "temporal resolution"
date date_created "date added to database"
}
model {
int mdl_seq PK "sequential model ID"
str ds_key FK "dataset key"
str taxa "taxonomic identifier"
str time_period "time period"
str region "geographic region"
str mdl_type "occurrence, range, suitability, biomass, density, diversity"
str description
date date_created "date added to database"
}
species {
int sp_id PK "species ID"
str ds_key FK "dataset key"
str taxa FK "links to model.taxa"
str sp_key "species key"
int aphia_id "WoRMS Aphia ID"
int gbif_id "GBIF ID"
int itis_id "ITIS ID"
str scientific_name
str common_name
str sp_cat "fish, mollusk, crustacean, etc."
}
cell {
int cell_id PK "cell ID"
dbl lon "longitude"
dbl lat "latitude"
dbl area_km2 "cell area in square kilometers"
geo geom "cell geometry"
}
model_cell {
int cell_id FK "cell ID"
int mdl_seq FK "model sequential ID"
dbl value "model value"
}
metric {
int metric_seq PK "auto-incrementing metric ID"
str metric_type "species_richness, shannon_index, etc."
str description
date date_created "date metric was created"
}
cell_metric {
int cell_id FK "cell ID"
int metric_seq FK "metric ID"
dbl value "calculated metric value"
}
region {
int rgn_seq PK "auto-incrementing region ID"
str tbl_col "{table}.{column} e.g. planning_areas.pa_key"
str value "unique identifier value e.g. CHU"
str col_geom "geometry column name"
date date_created "date added to database"
}
region_metric {
int rgn_seq FK "region ID"
int metric_seq FK "metric ID"
dbl value "calculated metric value"
}
region_cell {
int rgn_seq FK "region ID"
int cell_id FK "cell ID"
int pct_covered "percentage of cell covered by region"
}
planning_areas {
str pa_key PK "planning area key"
str pa_name
geo geom "planning area geometry"
}
%% relationships
dataset ||--o{ model : ds_key
dataset ||--o{ species : ds_key
model ||--|{ species : "ds_key,taxa"
model ||--o{ model_cell : mdl_seq
cell ||--o{ model_cell : cell_id
cell ||--o{ cell_metric : cell_id
cell ||--o{ region_cell : cell_id
metric ||--o{ cell_metric : metric_seq
metric ||--o{ region_metric : metric_seq
region ||--o{ region_metric : rgn_seq
region ||--o{ region_cell : rgn_seq
3 Create Database Tables
Code
# (Re)build the SDM schema: drop any legacy/current tables, then create
# sequences, tables (parents before children), and indexes.

# drop existing tables if needed
if (dbExistsTable(con_sdm, "model_cell") || dbExistsTable(con_sdm, "sdm_values")) {
  tables_to_drop <- c(
    "region_metric", "region_cell", "cell_metric", "env_layers",
    "model_cell", "sdm_values", "species", "model", "dataset",
    "cell", "sdm_cells", "metric", "region", "planning_areas",
    "biodiv_metrics", "planning_area_metrics", "sdm_species",
    "sdm_models", "sdm_sources")
  for (tbl in tables_to_drop) {
    if (dbExistsTable(con_sdm, tbl))
      dbExecute(con_sdm, glue("DROP TABLE {tbl} CASCADE"))
  }
}

# DDL statements in dependency order: sequences precede the tables that use
# them via nextval(); referenced tables precede their referencing tables.
ddl_statements <- c(
  "CREATE TABLE dataset (
     ds_key VARCHAR PRIMARY KEY,
     name_short VARCHAR NOT NULL,
     name_original VARCHAR,
     description TEXT,
     citation TEXT,
     source_broad VARCHAR,
     source_detail VARCHAR,
     regions VARCHAR,
     response_type VARCHAR,
     taxa_groups VARCHAR,
     year_pub INTEGER,
     date_obs_beg DATE,
     date_obs_end DATE,
     date_env_beg DATE,
     date_env_end DATE,
     link_info VARCHAR,
     link_download VARCHAR,
     link_metadata VARCHAR,
     links_other VARCHAR,
     spatial_res_deg DOUBLE,
     temporal_res VARCHAR,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE SEQUENCE model_mdl_seq_seq START 1",
  "CREATE TABLE model (
     mdl_seq INTEGER PRIMARY KEY DEFAULT nextval('model_mdl_seq_seq'),
     ds_key VARCHAR REFERENCES dataset(ds_key),
     taxa VARCHAR,
     time_period VARCHAR,
     region VARCHAR,
     mdl_type VARCHAR,
     description TEXT,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE TABLE species (
     sp_id INTEGER PRIMARY KEY,
     ds_key VARCHAR REFERENCES dataset(ds_key),
     taxa VARCHAR,
     sp_key VARCHAR NOT NULL,
     aphia_id INTEGER,
     gbif_id INTEGER,
     itis_id INTEGER,
     scientific_name VARCHAR,
     common_name VARCHAR,
     sp_cat VARCHAR)",
  "CREATE TABLE cell (
     cell_id INTEGER PRIMARY KEY,
     lon DOUBLE NOT NULL,
     lat DOUBLE NOT NULL,
     area_km2 DOUBLE NOT NULL,
     geom GEOMETRY)",
  "CREATE TABLE model_cell (
     cell_id INTEGER REFERENCES cell(cell_id),
     mdl_seq INTEGER REFERENCES model(mdl_seq),
     value DOUBLE,
     PRIMARY KEY (cell_id, mdl_seq))",
  "CREATE SEQUENCE metric_metric_seq_seq START 1",
  "CREATE TABLE metric (
     metric_seq INTEGER PRIMARY KEY DEFAULT nextval('metric_metric_seq_seq'),
     metric_type VARCHAR NOT NULL,
     description TEXT,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE TABLE cell_metric (
     cell_id INTEGER REFERENCES cell(cell_id),
     metric_seq INTEGER REFERENCES metric(metric_seq),
     value DOUBLE,
     PRIMARY KEY (cell_id, metric_seq))",
  "CREATE SEQUENCE region_rgn_seq_seq START 1",
  "CREATE TABLE region (
     rgn_seq INTEGER PRIMARY KEY DEFAULT nextval('region_rgn_seq_seq'),
     tbl_col VARCHAR NOT NULL,
     value VARCHAR NOT NULL,
     col_geom VARCHAR NOT NULL,
     date_created DATE DEFAULT CURRENT_DATE)",
  "CREATE TABLE region_metric (
     rgn_seq INTEGER REFERENCES region(rgn_seq),
     metric_seq INTEGER REFERENCES metric(metric_seq),
     value DOUBLE,
     PRIMARY KEY (rgn_seq, metric_seq))",
  "CREATE TABLE region_cell (
     rgn_seq INTEGER REFERENCES region(rgn_seq),
     cell_id INTEGER REFERENCES cell(cell_id),
     pct_covered INTEGER,
     PRIMARY KEY (rgn_seq, cell_id))",
  "CREATE TABLE planning_areas (
     pa_key VARCHAR PRIMARY KEY,
     pa_name VARCHAR,
     geom GEOMETRY)")

# indexes to speed cell/model joins, coordinate lookups, and spatial queries
index_statements <- c(
  "CREATE INDEX idx_model_cell_cell_id ON model_cell(cell_id)",
  "CREATE INDEX idx_model_cell_mdl_seq ON model_cell(mdl_seq)",
  "CREATE INDEX idx_cell_coords ON cell(lon, lat)",
  "CREATE INDEX idx_species_taxa ON species(ds_key, taxa)",
  "CREATE INDEX idx_model_taxa ON model(ds_key, taxa)",
  "CREATE INDEX idx_region_cell_rgn ON region_cell(rgn_seq)",
  "CREATE INDEX idx_region_cell_cell ON region_cell(cell_id)",
  "CREATE SPATIAL INDEX idx_cell_geom ON cell(geom)",
  "CREATE SPATIAL INDEX idx_planning_areas_geom ON planning_areas(geom)")

for (sql in c(ddl_statements, index_statements)) {
  dbExecute(con_sdm, sql)
}
4 Ingest AquaMaps Source
Code
# Register AquaMaps as a dataset (one row in `dataset`).
aquamaps_dataset <- tibble(
  ds_key          = "am_0.05",
  name_short      = "AquaMaps fish suitability in Global, 2023",
  name_original   = "AquaMaps: Predicted range maps for aquatic species",
  description     = "AquaMaps species distribution models downscaled to 0.05° using Bio-Oracle v3 environmental layers",
  citation        = "Kaschner et al. (2023) AquaMaps: Predicted range maps for aquatic species. Version 10/2019. www.aquamaps.org",
  source_broad    = "AquaMaps",
  source_detail   = "AquaMaps.org collaborative project",
  regions         = "Global",
  response_type   = "suitability",
  taxa_groups     = "fish",
  year_pub        = 2023,
  date_obs_beg    = as.Date("1990-01-01"),
  date_obs_end    = as.Date("2020-12-31"),
  date_env_beg    = as.Date("2000-01-01"),
  date_env_end    = as.Date("2020-12-31"),
  link_info       = "https://www.aquamaps.org",
  link_download   = "https://www.aquamaps.org/download/main.php",
  link_metadata   = "https://www.aquamaps.org/main/alg_2019.php",
  links_other     = "https://www.fishbase.org https://www.sealifebase.org",
  spatial_res_deg = 0.05,
  temporal_res    = "static")

dbWriteTable(con_sdm, "dataset", aquamaps_dataset, append = TRUE)
5 Load Planning Areas
Code
# Load planning-area polygons from the existing database into DuckDB.
# NOTE(review): `con` is the source connection opened in libs/db.R — confirm.

# read planning areas from existing database
ply_pa <- st_read(con, "ply_planareas_2025")

# simplify and reproject to WGS84 for DuckDB
planning_areas <- ply_pa |>
  select(pa_key = planarea_key, pa_name = planarea_name, geom) |> # TODO: keep original
  st_transform(4326)

# convert geometry to WKT so it can be passed through ST_GeomFromText()
pa_df <- planning_areas |>
  mutate(geom_wkt = st_as_text(geom)) |>
  st_drop_geometry()

dbExecute(con_sdm, "DELETE FROM planning_areas")
for (i in seq_len(nrow(pa_df))) {
  # parameterized insert: safe for names containing quotes (the original
  # glue-built SQL would break or inject on an apostrophe in pa_name)
  dbExecute(
    con_sdm,
    "INSERT INTO planning_areas (pa_key, pa_name, geom)
     VALUES (?, ?, ST_GeomFromText(?))",
    params = list(pa_df$pa_key[i], pa_df$pa_name[i], pa_df$geom_wkt[i]))
}
6 Create 0.05° Grid Cells
Code
# Build the global 0.05° cell grid and bulk-load it into the `cell` table.

# create global 0.05 degree raster template
res <- 0.05
r_global <- terra::rast(
  xmin = -180, xmax = 180,
  ymin = -90, ymax = 90,
  resolution = res,
  crs = "EPSG:4326")

# get cell IDs and coordinates using terra
cell_ids <- terra::cells(r_global)
coords   <- terra::xyFromCell(r_global, cell_ids)

# calculate cell areas in km2
# for 0.05 degree cells, area varies by latitude
cell_areas <- terra::cellSize(r_global, unit = "km")
areas_km2  <- terra::values(cell_areas)[cell_ids]

# create grid cells dataframe
grid_cells <- tibble(
  cell_id  = cell_ids,
  lon      = coords[, 1],
  lat      = coords[, 2],
  area_km2 = areas_km2)

message(glue("Inserting {nrow(grid_cells)} grid cells..."))

# Bulk load: register the data frame as a DuckDB view and build the point
# geometry in SQL with ST_Point(). The original inserted one row per
# dbExecute() call (millions of round-trips); this is a single statement.
duckdb::duckdb_register(con_sdm, "grid_cells_stage", grid_cells)
dbExecute(con_sdm, "
  INSERT INTO cell (cell_id, lon, lat, area_km2, geom)
  SELECT cell_id, lon, lat, area_km2, ST_Point(lon, lat)
  FROM grid_cells_stage")
duckdb::duckdb_unregister(con_sdm, "grid_cells_stage")

# save the global raster template for later use
saveRDS(r_global, glue("{dir_data}/derived/r_global_0.05.rds"))
7 Process AquaMaps downscaled models
Code
# Ingest per-species downscaled rasters: one `model` row, one `species` row,
# and the non-NA cell values into `model_cell` for each GeoTIFF.

# this chunk assumes the iterate_species_in_planareas chunk from
# ingest_aquamaps_res05.qmd has been run and produced species rasters

# get list of species rasters
dir_tifs <- glue("{dir_data}/derived/aquamaps.org/spp_cells0.05_cog")
# FIX: original referenced undefined `species_dir`; also "*.tif$" is not a
# valid regex for list.files() — escape the dot
spp_tifs <- list.files(dir_tifs, pattern = "\\.tif$", full.names = TRUE)

# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# process each species
sp_id_counter <- 1
for (sp_tif in head(spp_tifs, 10)) { # process first 10 for testing
  # FIX: original referenced undefined `sp_file`; the loop variable is sp_tif
  sp_key <- basename(sp_tif) |> path_ext_remove()
  message(glue("Processing species: {sp_key}"))

  # read species raster
  r_sp <- rast(sp_tif)

  # get species info from original database
  sp_info <- tbl(con, "spp") |>
    filter(sp_key == !!sp_key) |>
    collect()

  # create model for this species (one model per species for AquaMaps)
  model_info <- tibble(
    ds_key      = "am_0.05",
    taxa        = sp_key, # for AquaMaps, taxa equals sp_key
    time_period = "2019",
    region      = "Global",
    mdl_type    = "suitability",
    description = glue("AquaMaps suitability for {sp_info$genus[1]} {sp_info$species[1]}, interpolated to 0.05° resolution"))
  dbWriteTable(con_sdm, "model", model_info, append = TRUE)

  # get the mdl_seq that was just created
  mdl_seq <- dbGetQuery(con_sdm, glue("
    SELECT mdl_seq FROM model
    WHERE ds_key = '{model_info$ds_key}'
      AND taxa = '{model_info$taxa}'
    ORDER BY mdl_seq DESC
    LIMIT 1"))$mdl_seq

  # determine species category from taxonomy (scalar comparisons)
  sp_cat <- case_when(
    sp_info$class[1]  == "Actinopterygii" ~ "fish",
    sp_info$class[1]  == "Chondrichthyes" ~ "fish",
    sp_info$class[1]  == "Mammalia"       ~ "marine mammal",
    sp_info$class[1]  == "Reptilia"       ~ "sea turtle",
    sp_info$phylum[1] == "Mollusca"       ~ "mollusk",
    sp_info$phylum[1] == "Arthropoda" &
      sp_info$subphylum[1] == "Crustacea" ~ "crustacean",
    TRUE ~ "other")

  # add species to database
  species_info <- tibble(
    sp_id           = sp_id_counter,
    ds_key          = "am_0.05",
    taxa            = sp_key,
    sp_key          = sp_key,
    aphia_id        = sp_info$aphia_id[1],
    gbif_id         = NA_integer_, # TODO: lookup from GBIF
    itis_id         = NA_integer_, # TODO: lookup from ITIS
    scientific_name = glue("{sp_info$genus[1]} {sp_info$species[1]}"),
    common_name     = sp_info$common_name[1],
    sp_cat          = sp_cat)
  dbWriteTable(con_sdm, "species", species_info, append = TRUE)

  # Extract non-NA cell values. FIX: the original subset
  # terra::cells(r_sp) (length = #non-NA cells) by a logical mask of
  # length ncell(r_sp), silently recycling the mask. Read values once and
  # derive both the mask and the cell numbers from it.
  v    <- terra::values(r_sp, mat = FALSE)
  keep <- !is.na(v)

  # create model_cell records
  model_cell_df <- tibble(
    cell_id = which(keep),
    mdl_seq = mdl_seq,
    value   = v[keep] / 100) # convert from percentage to probability

  # batch insert values
  batch_insert_values(con_sdm, model_cell_df, "model_cell")

  sp_id_counter <- sp_id_counter + 1
}
8 Calculate Biodiversity Metrics
Code
# Define biodiversity metrics and compute them per cell inside DuckDB.
# NOTE(review): simpson_index is registered below but never computed in
# this notebook — confirm whether that is intentional.

# first create metric definitions
metrics_to_create <- tibble::tribble(
  ~metric_type,       ~description,
  "species_richness", "Number of species with suitability > 0.5",
  "shannon_index",    "Shannon diversity index based on suitability values",
  "simpson_index",    "Simpson diversity index based on suitability values")

for (i in seq_len(nrow(metrics_to_create))) {
  # parameterized insert: descriptions may contain quote characters
  dbExecute(
    con_sdm,
    "INSERT INTO metric (metric_type, description) VALUES (?, ?)",
    params = list(
      metrics_to_create$metric_type[i],
      metrics_to_create$description[i]))
}

# get metric IDs
metric_ids <- dbGetQuery(con_sdm, "SELECT metric_seq, metric_type FROM metric")

# calculate species richness per cell
message("Calculating species richness...")
richness_metric_seq <-
  metric_ids$metric_seq[metric_ids$metric_type == "species_richness"]
dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT mc.cell_id,
         {richness_metric_seq} as metric_seq,
         COUNT(DISTINCT m.taxa) as value
  FROM model_cell mc
  JOIN model m ON mc.mdl_seq = m.mdl_seq
  WHERE mc.value > 0.5 -- presence threshold
  GROUP BY mc.cell_id"))

# calculate Shannon diversity index: H = -sum(p * ln(p)) where p is each
# model's share of the cell's total suitability
message("Calculating Shannon diversity...")
shannon_metric_seq <-
  metric_ids$metric_seq[metric_ids$metric_type == "shannon_index"]
dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT cell_id,
         {shannon_metric_seq} as metric_seq,
         -SUM(p * LN(p)) as value
  FROM (
    SELECT mc.cell_id,
           mc.value / SUM(mc.value) OVER (PARTITION BY mc.cell_id) as p
    FROM model_cell mc
    WHERE mc.value > 0) t
  WHERE p > 0
  GROUP BY cell_id"))
9 Generate Biodiversity Raster
Code
# Query a regional species-richness slice, rasterize it, and map it.

# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# get species richness metric ID
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# query species richness for a region
richness_df <- dbGetQuery(con_sdm, glue("
  SELECT cm.cell_id, cm.value as species_richness
  FROM cell_metric cm
  WHERE cm.metric_seq = {richness_metric_seq}
    AND cm.cell_id IN (
      SELECT cell_id FROM cell
      WHERE lon BETWEEN -180 AND -60 -- limit to a region for testing
        AND lat BETWEEN 20 AND 50)"))

# convert to raster using the new function
r_richness <- create_raster_from_df(
  richness_df,
  r_template = r_global,
  value_cols = "species_richness")

# Render a raster with mapgl.
# FIX: removed an unused data-frame conversion of `r` (dead code in the
# original). NOTE(review): confirm add_raster_tile_layer() accepts the
# raster object directly via `source =` — verify against the mapgl API.
mapgl_rast <- function(r, palette = "viridis", layer_name = "SDM") {
  mapgl() |>
    add_raster_tile_layer(
      id            = "sdm_raster",
      source        = r,
      source_layer  = layer_name,
      color_palette = palette)
}

# visualize
mapgl_rast(r_richness, palette = "YlOrRd", layer_name = "Species Richness")
10 Summarize by Planning Area
Code
# Register planning areas as regions and summarize species richness per area.

# first, register planning areas as regions
planning_area_regions <- dbGetQuery(con_sdm, "
  SELECT pa_key, pa_name FROM planning_areas")

for (i in seq_len(nrow(planning_area_regions))) {
  # parameterized insert: safe if a key ever contains a quote
  dbExecute(
    con_sdm,
    "INSERT INTO region (tbl_col, value, col_geom)
     VALUES ('planning_areas.pa_key', ?, 'geom')",
    params = list(planning_area_regions$pa_key[i]))
}

# get region and metric IDs
region_ids <- dbGetQuery(con_sdm, "
  SELECT rgn_seq, value FROM region
  WHERE tbl_col = 'planning_areas.pa_key'")
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# calculate average species richness per planning area
message("Calculating planning area metrics...")
for (i in seq_len(nrow(region_ids))) {
  pa_key  <- region_ids$value[i]
  rgn_seq <- region_ids$rgn_seq[i]

  # average richness over cells whose (point) geometry falls in the polygon
  result <- dbGetQuery(con_sdm, glue("
    SELECT AVG(cm.value) as avg_value
    FROM planning_areas p
    JOIN cell c ON ST_Within(c.geom, p.geom)
    JOIN cell_metric cm ON c.cell_id = cm.cell_id
    WHERE p.pa_key = '{pa_key}'
      AND cm.metric_seq = {richness_metric_seq}"))

  # AVG over zero rows yields NA; only store real summaries
  if (!is.na(result$avg_value[1])) {
    dbExecute(
      con_sdm,
      "INSERT INTO region_metric (rgn_seq, metric_seq, value)
       VALUES (?, ?, ?)",
      params = list(rgn_seq, richness_metric_seq, result$avg_value[1]))
  }
}

# query results
pa_metrics <- dbGetQuery(con_sdm, "
  SELECT p.pa_key, p.pa_name, rm.value as avg_richness
  FROM region r
  JOIN region_metric rm ON r.rgn_seq = rm.rgn_seq
  JOIN planning_areas p ON r.value = p.pa_key
  WHERE r.tbl_col = 'planning_areas.pa_key'
    AND rm.metric_seq = (
      SELECT metric_seq FROM metric
      WHERE metric_type = 'species_richness')
  ORDER BY rm.value DESC")
print(pa_metrics)
11 Add Environmental Layers
Code
# Sketch: resample an environmental layer (net primary productivity) onto
# the 0.05° grid. NOTE(review): this chunk is incomplete — the final insert
# into env_layers is only a comment, and `env_layers` is not created by the
# schema chunk; confirm intent before enabling.

# example: add primary productivity from ingest_productivity.qmd
npp_tif <- here("data/derived/vgpm-viirs_2023-daily-avg.tif")

if (file.exists(npp_tif)) {
  r_npp <- rast(npp_tif)

  # resample to 0.05 degree resolution on the same global template
  r_template <- rast(
    extent = ext(-180, 180, -90, 90),
    resolution = 0.05,
    crs = "EPSG:4326")
  r_npp_res <- resample(r_npp, r_template, method = "bilinear")

  # extract values; `npp_daily_avg` is assumed to be the layer name — TODO confirm
  npp_df <- as.data.frame(r_npp_res, xy = TRUE) |>
    filter(!is.na(npp_daily_avg)) |>
    mutate(
      # NOTE(review): rounding to multiples of 0.05 snaps to cell EDGES;
      # grid-cell centers lie at 0.025 offsets — verify before joining to `cell`
      cell_lon = round(x / 0.05) * 0.05,
      cell_lat = round(y / 0.05) * 0.05)

  # insert into env_layers
  # (similar process as species values)
}
12 Export to Parquet
Code
# Export all tables to Parquet using DuckDB's native COPY. This streams
# straight from the database to disk and correctly serializes GEOMETRY
# columns, avoiding the original round-trip through an R data frame
# (dbGetQuery on GEOMETRY can mangle or drop the column).
output_dir <- glue("{dir_data}/derived/sdm_parquet")
dir_create(output_dir)

# export each table
tables_to_export <- c(
  "dataset", "model", "species", "cell", "model_cell", "metric",
  "cell_metric", "region", "region_metric", "region_cell",
  "planning_areas")

for (tbl in tables_to_export) {
  message(glue("Exporting {tbl} to parquet..."))
  pq_path <- file.path(output_dir, glue("{tbl}.parquet"))
  dbExecute(con_sdm, glue(
    "COPY (SELECT * FROM {tbl}) TO '{pq_path}'
     (FORMAT PARQUET, COMPRESSION SNAPPY)"))
}
13 Close Connection
Code
# Close the connection AND shut down the embedded DuckDB instance so the
# database file is released (plain dbDisconnect leaves the driver running).
dbDisconnect(con_sdm, shutdown = TRUE)
Source Code
---title: "Ingest AquaMaps to SDM DuckDB"subtitle: "Transform downscaled AquaMaps data into DuckDB for efficient biodiversity analysis"format: html: code-fold: true code-tools: trueeditor_options: chunk_output_type: console---## OverviewThis notebook ingests downscaled AquaMaps species distribution models (0.05° resolution) into a DuckDB database (`msens_sdm`) for efficient querying and biodiversity metric calculation. The database schema supports multiple SDM sources and includes spatial functions for raster generation and polygon summarization.```{r setup}#| message: false#| warning: falselibrarian::shelf( DBI, dbplyr, dplyr, duckdb, fs, glue, here, leaflet, mapgl, purrr, sf, terra, tibble, tidyr, quiet = T )source(here("libs/db.R"))source(here("libs/am_functions.R"))source(here("libs/sdm_functions.R"))# set up directoriesis_server <- Sys.info()[["sysname"]] == "Linux"dir_data <- ifelse( is_server, "/share/data", "~/My Drive/projects/msens/data")# database setupdb_path <- glue("{dir_data}/derived/msens_sdm.duckdb")con_sdm <- dbConnect(duckdb(), dbdir = db_path, read_only = FALSE)# dbDisconnect(con_sdm, shutdown = T)# duckdb_shutdown(duckdb())# Check if spatial extension is already loadedcheck_spatial <- function(con) { tryCatch({ # Try to use a spatial function - if it works, extension is loaded res <- dbGetQuery(con, "SELECT ST_Point(0, 0)") return(TRUE) }, error = function(e) { return(FALSE) })}# Only install/load if not already availableif (!check_spatial(con_sdm)) { res <- dbExecute(con_sdm, "INSTALL spatial; LOAD spatial;")}```## Database SchemaThe database schema supports multiple species per model and includes biodiversity metrics with flexible regional aggregation.```{mermaid}erDiagram%% https://mermaid.js.org/syntax/entityRelationshipDiagram.html%% tablesdataset { str ds_key PK "dataset key" str name_short "short name: {source_broad} {taxa_groups} {response_type} in {regions}, {year_pub}" str name_original "original dataset name" str description str 
citation str source_broad "e.g. NOAA, AquaMaps, Duke, NCCOS" str source_detail "e.g. NOAA Southeast Fisheries Science Center" str regions "e.g. Global, Atlantic, Pacific, Gulf of America" str response_type "occurrence, range, suitability, biomass, density, diversity" str taxa_groups "fish, invertebrates, marine mammals, cetaceans, sea turtles, seabirds" int year_pub "year published" date date_obs_beg "first observation date" date date_obs_end "last observation date" date date_env_beg "first environmental data date" date date_env_end "last environmental data date" str link_info "primary information URL" str link_download "download URL" str link_metadata "metadata URL" str links_other "other URLs space-separated" dbl spatial_res_deg "spatial resolution in decimal degrees" str temporal_res "temporal resolution" date date_created "date added to database"}model { int mdl_seq PK "sequential model ID" str ds_key FK "dataset key" str taxa "taxonomic identifier" str time_period "time period" str region "geographic region" str mdl_type "occurrence, range, suitability, biomass, density, diversity" str description date date_created "date added to database"}species { int sp_id PK "species ID" str ds_key FK "dataset key" str taxa FK "links to model.taxa" str sp_key "species key" int aphia_id "WoRMS Aphia ID" int gbif_id "GBIF ID" int itis_id "ITIS ID" str scientific_name str common_name str sp_cat "fish, mollusk, crustacean, etc."}cell { int cell_id PK "cell ID" dbl lon "longitude" dbl lat "latitude" dbl area_km2 "cell area in square kilometers" geo geom "cell geometry"}model_cell { int cell_id FK "cell ID" int mdl_seq FK "model sequential ID" dbl value "model value"}metric { int metric_seq PK "auto-incrementing metric ID" str metric_type "species_richness, shannon_index, etc." 
str description date date_created "date metric was created"}cell_metric { int cell_id FK "cell ID" int metric_seq FK "metric ID" dbl value "calculated metric value"}region { int rgn_seq PK "auto-incrementing region ID" str tbl_col "{table}.{column} e.g. planning_areas.pa_key" str value "unique identifier value e.g. CHU" str col_geom "geometry column name" date date_created "date added to database"}region_metric { int rgn_seq FK "region ID" int metric_seq FK "metric ID" dbl value "calculated metric value"}region_cell { int rgn_seq FK "region ID" int cell_id FK "cell ID" int pct_covered "percentage of cell covered by region"}planning_areas { str pa_key PK "planning area key" str pa_name geo geom "planning area geometry"}%% relationshipsdataset ||--o{ model : ds_keydataset ||--o{ species : ds_keymodel ||--|{ species : "ds_key,taxa"model ||--o{ model_cell : mdl_seqcell ||--o{ model_cell : cell_idcell ||--o{ cell_metric : cell_idcell ||--o{ region_cell : cell_idmetric ||--o{ cell_metric : metric_seqmetric ||--o{ region_metric : metric_seqregion ||--o{ region_metric : rgn_seqregion ||--o{ region_cell : rgn_seq``````{r tmp_not_evel, include=FALSE}knitr::opts_chunk$set(eval = F)```## Create Database Tables```{r create_tables}#| message: false# drop existing tables if neededif (dbExistsTable(con_sdm, "model_cell") || dbExistsTable(con_sdm, "sdm_values")) { tables_to_drop <- c( "region_metric", "region_cell", "cell_metric", "env_layers", "model_cell", "sdm_values", "species", "model", "dataset", "cell", "sdm_cells", "metric", "region", "planning_areas", "biodiv_metrics", "planning_area_metrics", "sdm_species", "sdm_models", "sdm_sources" ) for (tbl in tables_to_drop) { if (dbExistsTable(con_sdm, tbl)) dbExecute(con_sdm, glue("DROP TABLE {tbl} CASCADE")) }}# create tablesdbExecute(con_sdm, "CREATE TABLE dataset ( ds_key VARCHAR PRIMARY KEY, name_short VARCHAR NOT NULL, name_original VARCHAR, description TEXT, citation TEXT, source_broad VARCHAR, source_detail VARCHAR, regions 
VARCHAR, response_type VARCHAR, taxa_groups VARCHAR, year_pub INTEGER, date_obs_beg DATE, date_obs_end DATE, date_env_beg DATE, date_env_end DATE, link_info VARCHAR, link_download VARCHAR, link_metadata VARCHAR, links_other VARCHAR, spatial_res_deg DOUBLE, temporal_res VARCHAR, date_created DATE DEFAULT CURRENT_DATE)")dbExecute(con_sdm, "CREATE SEQUENCE model_mdl_seq_seq START 1")dbExecute(con_sdm, "CREATE TABLE model ( mdl_seq INTEGER PRIMARY KEY DEFAULT nextval('model_mdl_seq_seq'), ds_key VARCHAR REFERENCES dataset(ds_key), taxa VARCHAR, time_period VARCHAR, region VARCHAR, mdl_type VARCHAR, description TEXT, date_created DATE DEFAULT CURRENT_DATE)")dbExecute(con_sdm, "CREATE TABLE species ( sp_id INTEGER PRIMARY KEY, ds_key VARCHAR REFERENCES dataset(ds_key), taxa VARCHAR, sp_key VARCHAR NOT NULL, aphia_id INTEGER, gbif_id INTEGER, itis_id INTEGER, scientific_name VARCHAR, common_name VARCHAR, sp_cat VARCHAR)")dbExecute(con_sdm, "CREATE TABLE cell ( cell_id INTEGER PRIMARY KEY, lon DOUBLE NOT NULL, lat DOUBLE NOT NULL, area_km2 DOUBLE NOT NULL, geom GEOMETRY)")dbExecute(con_sdm, "CREATE TABLE model_cell ( cell_id INTEGER REFERENCES cell(cell_id), mdl_seq INTEGER REFERENCES model(mdl_seq), value DOUBLE, PRIMARY KEY (cell_id, mdl_seq))")dbExecute(con_sdm, "CREATE SEQUENCE metric_metric_seq_seq START 1")dbExecute(con_sdm, "CREATE TABLE metric ( metric_seq INTEGER PRIMARY KEY DEFAULT nextval('metric_metric_seq_seq'), metric_type VARCHAR NOT NULL, description TEXT, date_created DATE DEFAULT CURRENT_DATE)")dbExecute(con_sdm, "CREATE TABLE cell_metric ( cell_id INTEGER REFERENCES cell(cell_id), metric_seq INTEGER REFERENCES metric(metric_seq), value DOUBLE, PRIMARY KEY (cell_id, metric_seq))")dbExecute(con_sdm, "CREATE SEQUENCE region_rgn_seq_seq START 1")dbExecute(con_sdm, "CREATE TABLE region ( rgn_seq INTEGER PRIMARY KEY DEFAULT nextval('region_rgn_seq_seq'), tbl_col VARCHAR NOT NULL, value VARCHAR NOT NULL, col_geom VARCHAR NOT NULL, date_created DATE DEFAULT 
CURRENT_DATE)")dbExecute(con_sdm, "CREATE TABLE region_metric ( rgn_seq INTEGER REFERENCES region(rgn_seq), metric_seq INTEGER REFERENCES metric(metric_seq), value DOUBLE, PRIMARY KEY (rgn_seq, metric_seq))")dbExecute(con_sdm, "CREATE TABLE region_cell ( rgn_seq INTEGER REFERENCES region(rgn_seq), cell_id INTEGER REFERENCES cell(cell_id), pct_covered INTEGER, PRIMARY KEY (rgn_seq, cell_id))")dbExecute(con_sdm, "CREATE TABLE planning_areas ( pa_key VARCHAR PRIMARY KEY, pa_name VARCHAR, geom GEOMETRY)")# create indexesdbExecute(con_sdm, "CREATE INDEX idx_model_cell_cell_id ON model_cell(cell_id)")dbExecute(con_sdm, "CREATE INDEX idx_model_cell_mdl_seq ON model_cell(mdl_seq)")dbExecute(con_sdm, "CREATE INDEX idx_cell_coords ON cell(lon, lat)")dbExecute(con_sdm, "CREATE INDEX idx_species_taxa ON species(ds_key, taxa)")dbExecute(con_sdm, "CREATE INDEX idx_model_taxa ON model(ds_key, taxa)")dbExecute(con_sdm, "CREATE INDEX idx_region_cell_rgn ON region_cell(rgn_seq)")dbExecute(con_sdm, "CREATE INDEX idx_region_cell_cell ON region_cell(cell_id)")dbExecute(con_sdm, "CREATE SPATIAL INDEX idx_cell_geom ON cell(geom)")dbExecute(con_sdm, "CREATE SPATIAL INDEX idx_planning_areas_geom ON planning_areas(geom)")```## Ingest AquaMaps Source```{r ingest_source}# add AquaMaps as a sourceaquamaps_dataset <- tibble( ds_key = "am_0.05", name_short = "AquaMaps fish suitability in Global, 2023", name_original = "AquaMaps: Predicted range maps for aquatic species", description = "AquaMaps species distribution models downscaled to 0.05° using Bio-Oracle v3 environmental layers", citation = "Kaschner et al. (2023) AquaMaps: Predicted range maps for aquatic species. Version 10/2019. 
www.aquamaps.org",
  source_broad    = "AquaMaps",
  source_detail   = "AquaMaps.org collaborative project",
  regions         = "Global",
  response_type   = "suitability",
  taxa_groups     = "fish",
  year_pub        = 2023,
  date_obs_beg    = as.Date("1990-01-01"),
  date_obs_end    = as.Date("2020-12-31"),
  date_env_beg    = as.Date("2000-01-01"),
  date_env_end    = as.Date("2020-12-31"),
  link_info       = "https://www.aquamaps.org",
  link_download   = "https://www.aquamaps.org/download/main.php",
  link_metadata   = "https://www.aquamaps.org/main/alg_2019.php",
  links_other     = "https://www.fishbase.org https://www.sealifebase.org",
  spatial_res_deg = 0.05,
  temporal_res    = "static")
dbWriteTable(con_sdm, "dataset", aquamaps_dataset, append = TRUE)
```

## Load Planning Areas

```{r load_planning_areas}
# read planning areas from the existing source database
# NOTE(review): `con` is assumed to come from libs/db.R — confirm it is open here
ply_pa <- st_read(con, "ply_planareas_2025")

# simplify columns and project to WGS84 for storage in DuckDB
planning_areas <- ply_pa |>
  select(pa_key = planarea_key, pa_name = planarea_name, geom) |>  # TODO: keep original
  st_transform(4326)

# write to DuckDB (convert geometry to WKT first)
pa_df <- planning_areas |>
  mutate(geom_wkt = st_as_text(geom)) |>
  st_drop_geometry()

dbExecute(con_sdm, "DELETE FROM planning_areas")
for (i in seq_len(nrow(pa_df))) {
  # parameterized insert: safe for names/WKT containing single quotes,
  # unlike the previous glue()-built SQL string
  dbExecute(
    con_sdm,
    "INSERT INTO planning_areas (pa_key, pa_name, geom)
     VALUES (?, ?, ST_GeomFromText(?))",
    params = list(pa_df$pa_key[i], pa_df$pa_name[i], pa_df$geom_wkt[i]))
}
```

## Create 0.05° Grid Cells

```{r create_grid}
#| message: false
# create global 0.05 degree raster template
res <- 0.05
r_global <- terra::rast(
  xmin = -180, xmax = 180, ymin = -90, ymax = 90,
  resolution = res, crs = "EPSG:4326")

# get cell IDs and coordinates using terra
cell_ids <- terra::cells(r_global)
coords   <- terra::xyFromCell(r_global, cell_ids)

# calculate cell areas in km2; for 0.05 degree geographic cells,
# area varies by latitude
cell_areas <- terra::cellSize(r_global, unit = "km")
areas_km2  <- terra::values(cell_areas)[cell_ids]

# create grid cells dataframe
grid_cells <- tibble(
  cell_id  = cell_ids,
  lon      = coords[, 1],
  lat      = coords[, 2],
  area_km2 = areas_km2)

# batch insert cells
# NOTE(review): this is one INSERT statement per cell (~26M round trips at
# 0.05°); consider dbWriteTable() to a staging table + a single
# INSERT ... SELECT with ST_Point() if this proves too slow
batch_size <- 10000
n_batches  <- ceiling(nrow(grid_cells) / batch_size)
message(glue("Inserting {nrow(grid_cells)} grid cells in {n_batches} batches..."))

for (i in seq_len(n_batches)) {
  start_idx <- (i - 1) * batch_size + 1
  end_idx   <- min(i * batch_size, nrow(grid_cells))

  batch <- grid_cells[start_idx:end_idx, ] |>
    mutate(geom_wkt = glue("POINT({lon} {lat})"))  # WKT point per cell center

  for (j in seq_len(nrow(batch))) {
    dbExecute(con_sdm, glue("
      INSERT INTO cell (cell_id, lon, lat, area_km2, geom)
      VALUES ({batch$cell_id[j]}, {batch$lon[j]}, {batch$lat[j]},
              {batch$area_km2[j]}, ST_GeomFromText('{batch$geom_wkt[j]}'))"))
  }

  if (i %% 10 == 0) {
    message(glue("  Completed batch {i}/{n_batches}"))
  }
}

# save the global raster template for later use
# NOTE(review): saveRDS() on a SpatRaster relies on terra's saveRDS method
# (newer terra versions); otherwise wrap with terra::wrap() first — confirm
saveRDS(r_global, glue("{dir_data}/derived/r_global_0.05.rds"))
```

## Process Aquamaps downsized models

```{r process_species}
#| eval: false
# this chunk assumes the iterate_species_in_planareas chunk from
# ingest_aquamaps_res05.qmd has been run and produced species rasters

# get list of species rasters
dir_tifs <- glue("{dir_data}/derived/aquamaps.org/spp_cells0.05_cog")
# FIX: was list.files(species_dir, ...) — `species_dir` is undefined;
# also "*.tif$" is a glob, not a regex
spp_tifs <- list.files(dir_tifs, pattern = "\\.tif$", full.names = TRUE)

# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# process each species
sp_id_counter <- 1
for (sp_tif in spp_tifs[1:10]) {  # process first 10 for testing

  # FIX: was basename(sp_file) — `sp_file` is undefined; loop variable is sp_tif
  sp_key <- basename(sp_tif) |> path_ext_remove()
  message(glue("Processing species: {sp_key}"))

  # read species raster
  r_sp <- rast(sp_tif)

  # get species info from original database
  sp_info <- tbl(con, "spp") |>
    filter(sp_key == !!sp_key) |>
    collect()

  # create model for this species (one model per species for AquaMaps)
  model_info <- tibble(
    ds_key      = "am_0.05",
    taxa        = sp_key,  # for AquaMaps, taxa equals sp_key
    time_period = "2019",
    region      = "Global",
    mdl_type    = "suitability",
    description = glue(
      "AquaMaps suitability for {sp_info$genus[1]} {sp_info$species[1]}, interpolated to 0.05° resolution"))
  dbWriteTable(con_sdm, "model", model_info, append = TRUE)

  # get the mdl_seq that was just created
  mdl_seq <- dbGetQuery(con_sdm, glue("
    SELECT mdl_seq FROM model
    WHERE ds_key = '{model_info$ds_key}'
      AND taxa   = '{model_info$taxa}'
    ORDER BY mdl_seq DESC
    LIMIT 1"))$mdl_seq

  # determine species category from taxonomy
  sp_cat <- case_when(
    sp_info$class[1] == "Actinopterygii" ~ "fish",
    sp_info$class[1] == "Chondrichthyes" ~ "fish",
    sp_info$class[1] == "Mammalia"       ~ "marine mammal",
    sp_info$class[1] == "Reptilia"       ~ "sea turtle",
    sp_info$phylum[1] == "Mollusca"      ~ "mollusk",
    sp_info$phylum[1] == "Arthropoda" &&
      sp_info$subphylum[1] == "Crustacea" ~ "crustacean",
    TRUE ~ "other")

  # add species to database
  species_info <- tibble(
    sp_id           = sp_id_counter,
    ds_key          = "am_0.05",
    taxa            = sp_key,
    sp_key          = sp_key,
    aphia_id        = sp_info$aphia_id[1],
    gbif_id         = NA_integer_,  # TODO: lookup from GBIF
    itis_id         = NA_integer_,  # TODO: lookup from ITIS
    scientific_name = glue("{sp_info$genus[1]} {sp_info$species[1]}"),
    common_name     = sp_info$common_name[1],
    sp_cat          = sp_cat)
  dbWriteTable(con_sdm, "species", species_info, append = TRUE)

  # extract non-NA cell values
  # FIX: previous code read terra::values() twice with inconsistent NA
  # handling, risking misaligned cell_id/value vectors; read once and subset
  # (cell numbers are 1..ncell in row-major order, matching the values vector)
  v    <- terra::values(r_sp, mat = FALSE)
  keep <- !is.na(v)
  model_cell_df <- tibble(
    cell_id = which(keep),
    mdl_seq = mdl_seq,
    value   = v[keep] / 100)  # convert from percentage to probability

  # batch insert values
  batch_insert_values(con_sdm, model_cell_df, "model_cell")

  sp_id_counter <- sp_id_counter + 1
}
```

## Calculate Biodiversity Metrics

```{r calc_biodiv_metrics}
#| eval: false
# first create metric definitions
metrics_to_create <- tibble::tribble(
  ~metric_type,       ~description,
  "species_richness", "Number of species with suitability > 0.5",
  "shannon_index",    "Shannon diversity index based on suitability values",
  "simpson_index",    "Simpson diversity index based on suitability values")

for (i in seq_len(nrow(metrics_to_create))) {
  dbExecute(con_sdm, glue("
    INSERT INTO metric (metric_type, description)
    VALUES ('{metrics_to_create$metric_type[i]}',
            '{metrics_to_create$description[i]}')"))
}

# get metric IDs
metric_ids <- dbGetQuery(con_sdm, "SELECT metric_seq, metric_type FROM metric")

# calculate species richness per cell
message("Calculating species richness...")
richness_metric_seq <- metric_ids$metric_seq[
  metric_ids$metric_type == "species_richness"]

dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT
    mc.cell_id,
    {richness_metric_seq} as metric_seq,
    COUNT(DISTINCT m.taxa) as value
  FROM model_cell mc
  JOIN model m ON mc.mdl_seq = m.mdl_seq
  WHERE mc.value > 0.5 -- presence threshold
  GROUP BY mc.cell_id"))

# calculate Shannon diversity index
message("Calculating Shannon diversity...")
shannon_metric_seq <- metric_ids$metric_seq[
  metric_ids$metric_type == "shannon_index"]

dbExecute(con_sdm, glue("
  INSERT INTO cell_metric (cell_id, metric_seq, value)
  SELECT
    cell_id,
    {shannon_metric_seq} as metric_seq,
    -SUM(p * LN(p)) as value
  FROM (
    SELECT
      mc.cell_id,
      mc.value / SUM(mc.value) OVER (PARTITION BY mc.cell_id) as p
    FROM model_cell mc
    WHERE mc.value > 0) t
  WHERE p > 0
  GROUP BY cell_id"))
```

## Generate Biodiversity Raster

```{r generate_raster}
#| eval: false
# load global raster template
r_global <- readRDS(glue("{dir_data}/derived/r_global_0.05.rds"))

# get species richness metric ID
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# query species richness for a region
richness_df <- dbGetQuery(con_sdm, glue("
  SELECT cm.cell_id, cm.value as species_richness
  FROM cell_metric cm
  WHERE cm.metric_seq = {richness_metric_seq}
    AND cm.cell_id IN (
      SELECT cell_id FROM cell
      WHERE lon BETWEEN -180 AND -60 -- limit to a region for testing
        AND lat BETWEEN 20 AND 50)"))

# convert to raster using the new function
r_richness <- create_raster_from_df(
  richness_df,
  r_template = r_global,
  value_cols = "species_richness")

# display with mapgl
# NOTE(review): this helper looks unfinished — `r_df` is computed but never
# used, and add_raster_tile_layer() normally expects a tile source rather
# than a SpatRaster; confirm against the mapgl package documentation
mapgl_rast <- function(r, palette = "viridis", layer_name = "SDM") {
  # convert raster to data frame (currently unused; kept for future styling)
  r_df <- as.data.frame(r, xy = TRUE) |>
    rename(value = 3)

  # create mapgl visualization
  mapgl() |>
    add_raster_tile_layer(
      id            = "sdm_raster",
      source        = r,
      source_layer  = layer_name,
      color_palette = palette)
}

# visualize
mapgl_rast(r_richness, palette = "YlOrRd", layer_name = "Species Richness")
```

## Summarize by Planning Area

```{r summarize_planning_areas}
#| eval: false
# first, register planning areas as regions
planning_area_regions <- dbGetQuery(con_sdm, "
  SELECT pa_key, pa_name FROM planning_areas")

for (i in seq_len(nrow(planning_area_regions))) {
  dbExecute(con_sdm, glue("
    INSERT INTO region (tbl_col, value, col_geom)
    VALUES ('planning_areas.pa_key',
            '{planning_area_regions$pa_key[i]}', 'geom')"))
}

# get region and metric IDs
region_ids <- dbGetQuery(con_sdm, "
  SELECT rgn_seq, value FROM region
  WHERE tbl_col = 'planning_areas.pa_key'")
richness_metric_seq <- dbGetQuery(con_sdm, "
  SELECT metric_seq FROM metric
  WHERE metric_type = 'species_richness'")$metric_seq

# calculate average species richness per planning area
message("Calculating planning area metrics...")
for (i in seq_len(nrow(region_ids))) {
  pa_key  <- region_ids$value[i]
  rgn_seq <- region_ids$rgn_seq[i]

  # calculate metric for this region (cells spatially within the polygon)
  result <- dbGetQuery(con_sdm, glue("
    SELECT AVG(cm.value) as avg_value
    FROM planning_areas p
    JOIN cell c        ON ST_Within(c.geom, p.geom)
    JOIN cell_metric cm ON c.cell_id = cm.cell_id
    WHERE p.pa_key = '{pa_key}'
      AND cm.metric_seq = {richness_metric_seq}"))

  # AVG() over zero rows returns NA — skip regions with no overlapping cells
  if (!is.na(result$avg_value[1])) {
    dbExecute(con_sdm, glue("
      INSERT INTO region_metric (rgn_seq, metric_seq, value)
      VALUES ({rgn_seq}, {richness_metric_seq}, {result$avg_value[1]})"))
  }
}

# query results
pa_metrics <- dbGetQuery(con_sdm, "
  SELECT p.pa_key, p.pa_name, rm.value as avg_richness
  FROM region r
  JOIN region_metric rm ON r.rgn_seq = rm.rgn_seq
  JOIN planning_areas p ON r.value = p.pa_key
  WHERE r.tbl_col = 'planning_areas.pa_key'
    AND rm.metric_seq = (SELECT metric_seq FROM metric
                         WHERE metric_type = 'species_richness')
  ORDER BY rm.value DESC")
print(pa_metrics)
```

## Add Environmental Layers

```{r add_env_layers}
#| eval: false
# example: add primary productivity from ingest_productivity.qmd
npp_tif <- here("data/derived/vgpm-viirs_2023-daily-avg.tif")

if (file.exists(npp_tif)) {
  r_npp <- rast(npp_tif)

  # resample to 0.05 degree resolution
  r_template <- rast(
    extent     = ext(-180, 180, -90, 90),
    resolution = 0.05,
    crs        = "EPSG:4326")
  r_npp_res <- resample(r_npp, r_template, method = "bilinear")

  # extract values, snapping coordinates to the 0.05° grid
  npp_df <- as.data.frame(r_npp_res, xy = TRUE) |>
    filter(!is.na(npp_daily_avg)) |>
    mutate(
      cell_lon = round(x / 0.05) * 0.05,
      cell_lat = round(y / 0.05) * 0.05)

  # insert into env_layers
  # (similar process as species values)
}
```

## Export to Parquet

```{r export_parquet}
#| eval: false
# export tables to parquet for efficient storage
output_dir <- glue("{dir_data}/derived/sdm_parquet")
dir_create(output_dir)

# export each table
tables_to_export <- c(
  "dataset", "model", "species", "cell", "model_cell",
  "metric", "cell_metric", "region", "region_metric",
  "region_cell", "planning_areas")

for (tbl in tables_to_export) {
  message(glue("Exporting {tbl} to parquet..."))
  df <- dbGetQuery(con_sdm, glue("SELECT * FROM {tbl}"))
  arrow::write_parquet(
    df,
    file.path(output_dir, glue("{tbl}.parquet")),
    compression = "snappy")
}
```

## Close Connection

```{r cleanup}
dbDisconnect(con_sdm)
```