Fix taxon_model Duplicates

Published

2026-02-11 11:10:04

1 Problem

The taxon_model table has duplicate (taxon_id, ds_key) groups:

  • rng_iucn: exact duplicates (same mdl_seq repeated 3x per group)
  • ms_merge: distinct duplicates (different mdl_seq values; older ones are orphans not in model table)

2 Setup

Code
librarian::shelf(
  DBI,
  dplyr,
  duckdb,
  glue,
  quiet = T
)

is_server <- Sys.info()[["sysname"]] == "Linux"
dir_data <- ifelse(is_server, "/share/data", "~/My Drive/projects/msens/data")

# version
v_int <- 3
v_sfx <- paste0("_v", v_int)
sdm_db <- glue("{dir_data}/derived/sdm{v_sfx}.duckdb")

con_sdm <- dbConnect(duckdb(dbdir = sdm_db, read_only = F))

3 Before: report duplicates

Code
d_dups_before <- dbGetQuery(
  con_sdm,
  "
  SELECT ds_key, taxon_id, COUNT(*) AS n
  FROM taxon_model
  GROUP BY ds_key, taxon_id
  HAVING COUNT(*) > 1
  ORDER BY ds_key, taxon_id"
)

message(glue("Duplicate (taxon_id, ds_key) groups: {nrow(d_dups_before)}"))
Duplicate (taxon_id, ds_key) groups: 3089
Code
message(glue("Breakdown by ds_key:"))
Breakdown by ds_key:
Code
d_dups_before |>
  count(ds_key, name = "n_dup_groups") |>
  print()
    ds_key n_dup_groups
1 ms_merge         1573
2 rng_iucn         1516
Code
n_total_before <- dbGetQuery(con_sdm, "SELECT COUNT(*) AS n FROM taxon_model")$n
message(glue("Total rows before cleanup: {n_total_before}"))
Total rows before cleanup: 25335

4 Clean: remove duplicates and orphaned mdl_seq

Recreate taxon_model keeping only DISTINCT rows whose mdl_seq exists in model.

Code
# recreate without duplicates and without orphaned mdl_seq
dbExecute(
  con_sdm,
  "
  CREATE TABLE taxon_model_clean AS
  SELECT DISTINCT taxon_id, ds_key, mdl_seq
  FROM taxon_model
  WHERE mdl_seq IN (SELECT mdl_seq FROM model)"
)
[1] 20758
Code
dbExecute(con_sdm, "DROP TABLE taxon_model")
[1] 0
Code
dbExecute(con_sdm, "ALTER TABLE taxon_model_clean RENAME TO taxon_model")
[1] 0
Code
# add unique constraint to prevent future duplicates
dbExecute(
  con_sdm,
  "CREATE UNIQUE INDEX idx_taxon_model_tid_dskey ON taxon_model(taxon_id, ds_key)"
)
[1] 0
Code
message("Cleaned taxon_model and added UNIQUE index on (taxon_id, ds_key)")
Cleaned taxon_model and added UNIQUE index on (taxon_id, ds_key)

5 After: verify no duplicates remain

Code
d_dups_after <- dbGetQuery(
  con_sdm,
  "
  SELECT ds_key, taxon_id, COUNT(*) AS n
  FROM taxon_model
  GROUP BY ds_key, taxon_id
  HAVING COUNT(*) > 1"
)

n_total_after <- dbGetQuery(con_sdm, "SELECT COUNT(*) AS n FROM taxon_model")$n
message(glue("Total rows after cleanup: {n_total_after}"))
Total rows after cleanup: 20758
Code
message(glue("Rows removed: {n_total_before - n_total_after}"))
Rows removed: 4577
Code
message(glue("Remaining duplicate groups: {nrow(d_dups_after)}"))
Remaining duplicate groups: 0
Code
stopifnot(nrow(d_dups_after) == 0)

# verify unique index prevents re-insertion
tryCatch(
  {
    d_test <- dbGetQuery(
      con_sdm,
      "SELECT taxon_id, ds_key, mdl_seq FROM taxon_model LIMIT 1"
    )
    dbExecute(
      con_sdm,
      glue(
        "INSERT INTO taxon_model VALUES ({d_test$taxon_id}, '{d_test$ds_key}', {d_test$mdl_seq})"
      )
    )
    stop("UNIQUE constraint did NOT prevent duplicate insertion!")
  },
  error = function(e) {
    if (grepl("Duplicate key|UNIQUE|unique", e$message)) {
      message("UNIQUE index verified: duplicate INSERT correctly rejected")
    } else {
      stop(e)
    }
  }
)
UNIQUE index verified: duplicate INSERT correctly rejected

6 Disconnect

Code
dbDisconnect(con_sdm, shutdown = TRUE)