d_dups_before <-dbGetQuery( con_sdm," SELECT ds_key, taxon_id, COUNT(*) AS n FROM taxon_model GROUP BY ds_key, taxon_id HAVING COUNT(*) > 1 ORDER BY ds_key, taxon_id")message(glue("Duplicate (taxon_id, ds_key) groups: {nrow(d_dups_before)}"))
Duplicate (taxon_id, ds_key) groups: 3089
Code
message(glue("Breakdown by ds_key:"))
Breakdown by ds_key:
Code
d_dups_before |>count(ds_key, name ="n_dup_groups") |>print()
n_total_before <-dbGetQuery(con_sdm, "SELECT COUNT(*) AS n FROM taxon_model")$nmessage(glue("Total rows before cleanup: {n_total_before}"))
Total rows before cleanup: 25335
4 Clean: remove duplicates and orphaned mdl_seq
Recreate taxon_model keeping only DISTINCT rows whose mdl_seq exists in model.
Code
# recreate without duplicates and without orphaned mdl_seqdbExecute( con_sdm," CREATE TABLE taxon_model_clean AS SELECT DISTINCT taxon_id, ds_key, mdl_seq FROM taxon_model WHERE mdl_seq IN (SELECT mdl_seq FROM model)")
[1] 20758
Code
dbExecute(con_sdm, "DROP TABLE taxon_model")
[1] 0
Code
dbExecute(con_sdm, "ALTER TABLE taxon_model_clean RENAME TO taxon_model")
[1] 0
Code
# add unique constraint to prevent future duplicatesdbExecute( con_sdm,"CREATE UNIQUE INDEX idx_taxon_model_tid_dskey ON taxon_model(taxon_id, ds_key)")
[1] 0
Code
message("Cleaned taxon_model and added UNIQUE index on (taxon_id, ds_key)")
Cleaned taxon_model and added UNIQUE index on (taxon_id, ds_key)
5 After: verify no duplicates remain
Code
d_dups_after <-dbGetQuery( con_sdm," SELECT ds_key, taxon_id, COUNT(*) AS n FROM taxon_model GROUP BY ds_key, taxon_id HAVING COUNT(*) > 1")n_total_after <-dbGetQuery(con_sdm, "SELECT COUNT(*) AS n FROM taxon_model")$nmessage(glue("Total rows after cleanup: {n_total_after}"))