From b8a97804df30b77e8dc89a898d6e210ddf8cd31d Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 08:58:53 -0500 Subject: [PATCH 01/18] TRY: Add `try2sqlite` function. Move old workflow. --- base/db/.Rbuildignore | 1 + base/db/.gitignore | 1 + base/db/DESCRIPTION | 4 +- base/db/NAMESPACE | 1 + base/db/R/try2sqlite.R | 80 +++++++++++++++++++ base/db/inst/import-try/01_try_sqlite.R | 27 +++++++ ...{01.global.subset.R => 91.global.subset.R} | 0 ...fic.subset.R => 92.data.specific.subset.R} | 0 ...eate.try.sites.R => 93.create.try.sites.R} | 0 ...{04.match.species.R => 94.match.species.R} | 0 .../{05.citations.R => 95.citations.R} | 0 .../{06.load.data.R => 96.load.data.R} | 0 base/db/man/try2sqlite.Rd | 26 ++++++ 13 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 base/db/.gitignore create mode 100644 base/db/R/try2sqlite.R create mode 100644 base/db/inst/import-try/01_try_sqlite.R rename base/db/inst/import-try/{01.global.subset.R => 91.global.subset.R} (100%) rename base/db/inst/import-try/{02.data.specific.subset.R => 92.data.specific.subset.R} (100%) rename base/db/inst/import-try/{03.create.try.sites.R => 93.create.try.sites.R} (100%) rename base/db/inst/import-try/{04.match.species.R => 94.match.species.R} (100%) rename base/db/inst/import-try/{05.citations.R => 95.citations.R} (100%) rename base/db/inst/import-try/{06.load.data.R => 96.load.data.R} (100%) create mode 100644 base/db/man/try2sqlite.Rd diff --git a/base/db/.Rbuildignore b/base/db/.Rbuildignore index 91114bf2f2b..149d2856797 100644 --- a/base/db/.Rbuildignore +++ b/base/db/.Rbuildignore @@ -1,2 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ +try.sqlite diff --git a/base/db/.gitignore b/base/db/.gitignore new file mode 100644 index 00000000000..bcf2011eddd --- /dev/null +++ b/base/db/.gitignore @@ -0,0 +1 @@ +try.sqlite diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index 0e1d9183067..62cb1a48523 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -25,7 +25,9 @@ Imports: udunits2 Suggests: RPostgreSQL, - testthat (>= 1.0.2) + RSQLite, + testthat (>= 1.0.2), + data.table License: FreeBSD + file LICENSE Copyright: Authors LazyLoad: yes diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index 78717a7f0b4..d9d61ef7ff2 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -44,6 +44,7 @@ export(query.traits) export(rename_jags_columns) export(runs) export(take.samples) +export(try2sqlite) export(var_names_all) export(workflow) export(workflows) diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R new file mode 100644 index 00000000000..633bb499a48 --- /dev/null +++ b/base/db/R/try2sqlite.R @@ -0,0 +1,80 @@ +#' Convert TRY text file to SQLite database +#' +#' The TRY file is huge and unnecessarily long, which makes it difficult to +#' work with. The resulting SQLite database is much smaller on disk, and can be +#' read much faster thanks to lazy evaluation. +#' +#' The resulting TRY SQLite database contains the following tables: +#' - `values` -- The actual TRY data. Links to all other tables through ID columns. +#' - `traits` -- +#' - `datasets` +#' - `species` +#' +#' @param try_files Character vector of file names containing TRY data. +#' Multiple files will be `rbind`-ed together. +#' @param sqlite_file Target SQLite database file name, as character. +#' @export +try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { + # Read files + raw_data <- Map(data.table::fread, try_files) %>% + data.table::rbindlist() + + # Create integer reference ID for compact storage + raw_data[, ReferenceID := as.integer(factor(Reference))] + + # Create tables + data_cols <- c( + "ObsDataID", # TRY row ID -- unique to each observation of a given trait + "ObservationID", # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf) + "DataID", # Links to data ID + "StdValue", # Standardized, QA-QC'ed value + "UnitName", # Standardized unit + "AccSpeciesID", # Link to 'species' table + "DatasetID", # Link to 'datasets' table. + "ReferenceID", # Link to 'try_references' table. + "ValueKindName", # Type of value, e.g. mean, min, max, etc. + "UncertaintyName", # Kind of uncertainty + "Replicates", # Number of replicates + "RelUncertaintyPercent", + "OrigValueStr", # Original data, as character string (before QA/QC) + "OrigUnitStr", # Original unit, as character string (before QA/QC) + "OrigUncertaintyStr" # Original uncertainty, as character string (before QA/QC) + ) + data_values <- unique(raw_data[, data_cols, with = FALSE]) + data_values[, ] + + datasets_cols <- c( + "DatasetID", + "Dataset", + "LastName", + "FirstName", + "Reference" + ) + datasets_values <- unique(raw_data[, datasets_cols, with = FALSE]) + datasets_values[, doi := character()] # Add DOI column, to be filled later + + traits_cols <- c( + "DataID", + "DataName", + "TraitID", + "TraitName" + ) + traits_values <- unique(raw_data[, traits_cols, with = FALSE]) + + species_cols <- c( + "AccSpeciesID", + "AccSpeciesName", + "SpeciesName" + ) + species_values <- unique(raw_data[, species_cols, with = FALSE]) + species_values[, BetySpeciesID := integer()] # Add BETY ID column, to populate later + + con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file) + on.exit(DBI::dbDisconnect(con)) + DBI::dbWriteTable(con, "values", data_values) + DBI::dbWriteTable(con, "traits", data_values) + DBI::dbWriteTable(con, "datasets", datasets_values) + DBI::dbWriteTable(con, "species", species_values) + + NULL +} diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R new file mode 100644 index 00000000000..4010e1112a9 --- /dev/null +++ b/base/db/inst/import-try/01_try_sqlite.R @@ -0,0 +1,27 @@ +# --- +# title: Example TRY import workflow +# author: Alexey Shiklomanov +# --- + +# Create the TRY SQLite database +try_files <- "~/Projects/try/try-data/1829.txt" +sqlite_file <- "try.sqlite" + +if (!file.exists(sqlite_file)) { + try2sqlite(try_files, sqlite_file) +} + +trydb <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file) +bety <- DBI::dbConnect( + RPostgres::Postgres(), + dbname = "bety", + host = "test-pecan.bu.edu", + user = "bety", + password = "bety" +) + +# Add citations + +# Add sites + +# Add data diff --git a/base/db/inst/import-try/01.global.subset.R b/base/db/inst/import-try/91.global.subset.R similarity index 100% rename from base/db/inst/import-try/01.global.subset.R rename to base/db/inst/import-try/91.global.subset.R diff --git a/base/db/inst/import-try/02.data.specific.subset.R b/base/db/inst/import-try/92.data.specific.subset.R similarity index 100% rename from base/db/inst/import-try/02.data.specific.subset.R rename to base/db/inst/import-try/92.data.specific.subset.R diff --git a/base/db/inst/import-try/03.create.try.sites.R b/base/db/inst/import-try/93.create.try.sites.R similarity index 100% rename from base/db/inst/import-try/03.create.try.sites.R rename to base/db/inst/import-try/93.create.try.sites.R diff --git a/base/db/inst/import-try/04.match.species.R b/base/db/inst/import-try/94.match.species.R similarity index 100% rename from base/db/inst/import-try/04.match.species.R rename to base/db/inst/import-try/94.match.species.R diff --git a/base/db/inst/import-try/05.citations.R b/base/db/inst/import-try/95.citations.R similarity index 100% rename from base/db/inst/import-try/05.citations.R rename to base/db/inst/import-try/95.citations.R diff --git a/base/db/inst/import-try/06.load.data.R b/base/db/inst/import-try/96.load.data.R similarity index 100% rename from base/db/inst/import-try/06.load.data.R rename to base/db/inst/import-try/96.load.data.R diff --git a/base/db/man/try2sqlite.Rd b/base/db/man/try2sqlite.Rd new file mode 100644 index 00000000000..65f963e8bf3 --- /dev/null +++ b/base/db/man/try2sqlite.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/try2sqlite.R +\name{try2sqlite} +\alias{try2sqlite} +\title{Convert TRY text file to SQLite database} +\usage{ +try2sqlite(try_files, sqlite_file = "try.sqlite") +} +\arguments{ +\item{try_files}{Character vector of file names containing TRY data. +Multiple files will be `rbind`-ed together.} + +\item{sqlite_file}{Target SQLite database file name, as character.} +} +\description{ +The TRY file is huge and unnecessarily long, which makes it difficult to +work with. The resulting SQLite database is much smaller on disk, and can be +read much faster thanks to lazy evaluation. +} +\details{ +The resulting TRY SQLite database contains the following tables: + - `values` -- The actual TRY data. Links to all other tables through ID columns. + - `traits` -- + - `datasets` + - `species` +} From fe89c99bb0784489617b56e75e3b560a1b95dec0 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 11:33:42 -0600 Subject: [PATCH 02/18] DB: Fix `data.table` issues in try2sqlite --- base/db/.Rbuildignore | 2 +- base/db/R/try2sqlite.R | 19 ++++++++------- base/db/inst/import-try/01_try_sqlite.R | 31 +++++++++---------------- base/db/man/try2sqlite.Rd | 8 +++---- 4 files changed, 27 insertions(+), 33 deletions(-) diff --git a/base/db/.Rbuildignore b/base/db/.Rbuildignore index 149d2856797..02548d35d12 100644 --- a/base/db/.Rbuildignore +++ b/base/db/.Rbuildignore @@ -1,3 +1,3 @@ ^.*\.Rproj$ ^\.Rproj\.user$ -try.sqlite +try\.sqlite diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R index 633bb499a48..ffbcb9fe0bc 100644 --- a/base/db/R/try2sqlite.R +++ b/base/db/R/try2sqlite.R @@ -6,12 +6,12 @@ #' #' The resulting TRY SQLite database contains the following tables: #' - `values` -- The actual TRY data. Links to all other tables through ID columns. -#' - `traits` -- -#' - `datasets` -#' - `species` +#' - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table. +#' - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`. +#' - `species` -- Species. Links to `values` through `AccSpeciesID`. #' #' @param try_files Character vector of file names containing TRY data. -#' Multiple files will be `rbind`-ed together. +#' Multiple files are combined with `data.table::rbindlist`. #' @param sqlite_file Target SQLite database file name, as character. #' @export try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { @@ -20,7 +20,11 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { data.table::rbindlist() # Create integer reference ID for compact storage - raw_data[, ReferenceID := as.integer(factor(Reference))] + data.table::set( + raw_data, + j = "ReferenceID", + value = as.integer(factor(raw_data[["Reference"]])) + ) # Create tables data_cols <- c( @@ -41,7 +45,6 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { "OrigUncertaintyStr" # Original uncertainty, as character string (before QA/QC) ) data_values <- unique(raw_data[, data_cols, with = FALSE]) - data_values[, ] datasets_cols <- c( "DatasetID", @@ -51,7 +54,8 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { "Reference" ) datasets_values <- unique(raw_data[, datasets_cols, with = FALSE]) - datasets_values[, doi := character()] # Add DOI column, to be filled later + # Add DOI column, to be filled later + datasets_values[["doi"]] <- NA_character_ traits_cols <- c( "DataID", @@ -67,7 +71,6 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { "SpeciesName" ) species_values <- unique(raw_data[, species_cols, with = FALSE]) - species_values[, BetySpeciesID := integer()] # Add BETY ID column, to populate later con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file) on.exit(DBI::dbDisconnect(con)) diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R index 4010e1112a9..6dcd932b343 100644 --- a/base/db/inst/import-try/01_try_sqlite.R +++ b/base/db/inst/import-try/01_try_sqlite.R @@ -1,27 +1,18 @@ -# --- -# title: Example TRY import workflow -# author: Alexey Shiklomanov -# --- +# Create the TRY SQLite database +library(PEcAn.DB) # Create the TRY SQLite database -try_files <- "~/Projects/try/try-data/1829.txt" +try_files <- "~/try-data/1829.txt" sqlite_file <- "try.sqlite" -if (!file.exists(sqlite_file)) { - try2sqlite(try_files, sqlite_file) +if (!exists("overwrite")) { + overwrite <- FALSE } -trydb <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file) -bety <- DBI::dbConnect( - RPostgres::Postgres(), - dbname = "bety", - host = "test-pecan.bu.edu", - user = "bety", - password = "bety" -) - -# Add citations - -# Add sites +if (!file.exists(sqlite_file) || overwrite) { + try2sqlite(try_files, sqlite_file) +} -# Add data +if (!exists("force_add_doi")) { + force_add_doi <- FALSE +} diff --git a/base/db/man/try2sqlite.Rd b/base/db/man/try2sqlite.Rd index 65f963e8bf3..e4b6407712b 100644 --- a/base/db/man/try2sqlite.Rd +++ b/base/db/man/try2sqlite.Rd @@ -8,7 +8,7 @@ try2sqlite(try_files, sqlite_file = "try.sqlite") } \arguments{ \item{try_files}{Character vector of file names containing TRY data. -Multiple files will be `rbind`-ed together.} +Multiple files are combined with `data.table::rbindlist`.} \item{sqlite_file}{Target SQLite database file name, as character.} } @@ -20,7 +20,7 @@ read much faster thanks to lazy evaluation. \details{ The resulting TRY SQLite database contains the following tables: - `values` -- The actual TRY data. Links to all other tables through ID columns. - - `traits` -- - - `datasets` - - `species` + - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table. + - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`. + - `species` -- Species. Links to `values` through `AccSpeciesID`. } From e1128e3626c2c29169200eccba304c1e82d2a895 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 11:34:09 -0600 Subject: [PATCH 03/18] DB: Add `search_references` functions --- base/db/DESCRIPTION | 4 +- base/db/NAMESPACE | 1 + base/db/R/search_references.R | 66 ++++++++++++++++++++++++++ base/db/inst/import-try/02_citations.R | 19 ++++++++ base/db/inst/import-try/README.md | 7 ++- base/db/inst/import-try/bety_connect.R | 6 +++ base/db/man/search_reference_single.Rd | 21 ++++++++ base/db/man/search_references.Rd | 24 ++++++++++ 8 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 base/db/R/search_references.R create mode 100644 base/db/inst/import-try/02_citations.R create mode 100644 base/db/inst/import-try/bety_connect.R create mode 100644 base/db/man/search_reference_single.Rd create mode 100644 base/db/man/search_references.Rd diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index 62cb1a48523..8dabca46856 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -27,7 +27,9 @@ Suggests: RPostgreSQL, RSQLite, testthat (>= 1.0.2), - data.table + tidyverse, + data.table, + rcrossref License: FreeBSD + file LICENSE Copyright: Authors LazyLoad: yes diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index d9d61ef7ff2..b06517ac3c5 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -43,6 +43,7 @@ export(query.trait.data) export(query.traits) export(rename_jags_columns) export(runs) +export(search_references) export(take.samples) export(try2sqlite) export(var_names_all) diff --git a/base/db/R/search_references.R b/base/db/R/search_references.R new file mode 100644 index 00000000000..717fb47c5ba --- /dev/null +++ b/base/db/R/search_references.R @@ -0,0 +1,66 @@ +#' Perform crossref search for a list of references +#' +#' @param queries Character vector of queries +#' @inheritDotParams search_reference_single +#' @inherit search_reference_single description return +#' @export +search_references <- function(queries, ...) { + search_fun <- search_reference_single %>% + purrr::partial(...) %>% + purrr::possibly(otherwise = data.frame(title = "Not found")) + encodeString(queries) %>% + purrr::map_dfr(search_fun) +} + +#' Perform crossref search for a single reference +#' +#' Requires the `rcrossref` package. +#' +#' @param query Citation string (length 1) to search for DOI +#' @param min_score Minimum match score. Default (85) is fairly strict. +#' @param limit Number of results to return +#' @return `data.frame` containing crossref information converted to match bety citations table. +search_reference_single <- function(query, limit = 1, min_score = 85) { + stopifnot(length(query) == 1) + PEcAn.logger::logger.debug("Processing query:\n", query) + crsearch <- rcrossref::cr_works(query = query, limit = limit) + if (is.null(crsearch[["data"]])) { + PEcAn.logger::logger.warn( + "Error in crossref query. ", + "Setting title to search string and leaving other fields blank." + ) + return(tibble::tibble(query = query)) + } + crdata <- crsearch[["data"]] %>% + dplyr::mutate(score = as.numeric(score)) %>% + dplyr::filter(score > min_score) + if (nrow(crdata) < 1) { + PEcAn.logger::logger.info( + "No matches found. ", + "Setting title to search string and leaving other fields blank.") + return(tibble::tibble(query = query)) + } + keep_cols <- c( + "author", + "year", + "title", + journal = "container.title", + vol = "volume", + pg = "page", + doi = "DOI", + "score", + "query" + ) + proc_search <- crsearch %>% + dplyr::mutate( + author = purrr::map_chr( + author, + ~paste(.[["given"]], .[["family"]], sep = " ", collapse = ", ")), + year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(), + query = query, + score = as.numeric(score) + ) + use_cols <- keep_cols[keep_cols %in% colnames(proc_search)] + select(proc_search, !!!use_cols) +} + diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R new file mode 100644 index 00000000000..5039c078c0b --- /dev/null +++ b/base/db/inst/import-try/02_citations.R @@ -0,0 +1,19 @@ +# Add DOIs to TRY citations +library(tidyverse) +library(DBI) +library(RSQLite) +library(rcrossref) +library(PEcAn.logger) + +sqlite_file <- "try.sqlite" + +trydb <- dbConnect(SQLite(), sqlite_file) +references <- trydb %>% + tbl("datasets") %>% + distinct(Reference) %>% + pull(Reference) + +references <- readLines("references")[1:15] + +logger.setLevel("DEBUG") +tidy_refs <- search_references(references) diff --git a/base/db/inst/import-try/README.md b/base/db/inst/import-try/README.md index 33ec4ade800..7ad5b00de76 100644 --- a/base/db/inst/import-try/README.md +++ b/base/db/inst/import-try/README.md @@ -1,8 +1,13 @@ --- -title: "Import TRY database into BETY"" +title: "Import TRY database into BETY" Author: "Alexey Shiklomanov" --- +# Workflow + +1. `01_try_sqlite.R` -- Read TRY data files (huge tab-delimited text files) and convert them to an SQLite database, which is much faster and easier to work with. +2. `02_citations.R` -- Add DOIs to TRY references, if possible. + # Package dependencies 1. `data.table` -- Makes it remotely possible to work with the TRY database. Requires an up-to-date version, so if parts of the workflow break, try re-installing this. 2. `bit64` -- Used by `data.table` to read and store large integers, which constitude most of the ID's in TRY and BETY. diff --git a/base/db/inst/import-try/bety_connect.R b/base/db/inst/import-try/bety_connect.R new file mode 100644 index 00000000000..89d20b37f12 --- /dev/null +++ b/base/db/inst/import-try/bety_connect.R @@ -0,0 +1,6 @@ +bety <- PEcAn.DB::db.open(list( + host = "localhost", + dbname = "bety", + user = "bety", + password = "bety" +)) diff --git a/base/db/man/search_reference_single.Rd b/base/db/man/search_reference_single.Rd new file mode 100644 index 00000000000..c689ccdb86d --- /dev/null +++ b/base/db/man/search_reference_single.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search_references.R +\name{search_reference_single} +\alias{search_reference_single} +\title{Perform crossref search for a single reference} +\usage{ +search_reference_single(query, limit = 1, min_score = 85) +} +\arguments{ +\item{query}{Citation string (length 1) to search for DOI} + +\item{limit}{Number of results to return} + +\item{min_score}{Minimum match score. Default (85) is fairly strict.} +} +\value{ +`data.frame` containing crossref information converted to match bety citations table. +} +\description{ +Requires the `rcrossref` package. +} diff --git a/base/db/man/search_references.Rd b/base/db/man/search_references.Rd new file mode 100644 index 00000000000..8af809bf7a1 --- /dev/null +++ b/base/db/man/search_references.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/search_references.R +\name{search_references} +\alias{search_references} +\title{Perform crossref search for a list of references} +\usage{ +search_references(queries, ...) +} +\arguments{ +\item{queries}{Character vector of queries} + +\item{...}{Arguments passed on to \code{search_reference_single} +\describe{ + \item{query}{Citation string (length 1) to search for DOI} + \item{min_score}{Minimum match score. Default (85) is fairly strict.} + \item{limit}{Number of results to return} +}} +} +\value{ +`data.frame` containing crossref information converted to match bety citations table. +} +\description{ +Perform crossref search for a list of references +} From dbad6dc0e9d5693da78bf821bef111caa6c0faa9 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 11:47:09 -0600 Subject: [PATCH 04/18] DB: Bugfix search_references --- base/db/R/search_references.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/db/R/search_references.R b/base/db/R/search_references.R index 717fb47c5ba..86f5e4150f2 100644 --- a/base/db/R/search_references.R +++ b/base/db/R/search_references.R @@ -51,7 +51,7 @@ search_reference_single <- function(query, limit = 1, min_score = 85) { "score", "query" ) - proc_search <- crsearch %>% + proc_search <- crdata %>% dplyr::mutate( author = purrr::map_chr( author, From 004f0fef643df5a797aa3b12e3eaeca567e053bd Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 12:37:39 -0600 Subject: [PATCH 05/18] DB: Add `insert_table` function --- base/db/NAMESPACE | 1 + base/db/R/insert_table.R | 67 +++++++++++++++++++++++++++++++ base/db/man/build_insert_query.Rd | 18 +++++++++ base/db/man/insert_table.Rd | 30 ++++++++++++++ base/db/man/match_dbcols.Rd | 18 +++++++++ 5 files changed, 134 insertions(+) create mode 100644 base/db/R/insert_table.R create mode 100644 base/db/man/build_insert_query.Rd create mode 100644 base/db/man/insert_table.Rd create mode 100644 base/db/man/match_dbcols.Rd diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index b06517ac3c5..2724becb5e9 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -32,6 +32,7 @@ export(get_run_ids) export(get_users) export(get_var_names) export(get_workflow_ids) +export(insert_table) export(load_data_single_run) export(ncdays2date) export(query.file.path) diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R new file mode 100644 index 00000000000..0a8b23fdc5b --- /dev/null +++ b/base/db/R/insert_table.R @@ -0,0 +1,67 @@ +#' Insert R data frame into SQL database +#' +#' First, subset to matching columns. Then, build an SQL string for the insert +#' statement. Finally, insert into the database. +#' +#' @param values `data.frame` of values to write to SQL database +#' @param table Name of target SQL table, as character +#' @inheritParams db.query +#' @inherit db.query return +#' @export +#' @examples +#' library(dplyr) +#' irisfile <- tempfile(fileext = ".sqlite") +#' irisdb <- src_sqlite(irisfile, create = TRUE) +#' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) +#' insert_table(iris[-1,], "iris", irisdb) +#' tbl(irisdb, "iris") +insert_table <- function(values, table, con) { + use_cols <- match_dbcols(values, table, con) + if (length(use_cols) < 1) { + PEcAn.logger::logger.severe( + "No columns match between input and target table." + ) + } + PEcAn.logger::logger.debug( + "Matched the following cols: ", + paste(use_cols, collapse = ", ") + ) + values_sub <- values[, use_cols] + insert_query <- build_insert_query(values, table, con = con) + db.query(con, insert_query) +} + +#' Match names of local data frame to SQL table +#' +#' @inheritParams insert_table +match_dbcols <- function(values, table, con) { + tbl_db <- dplyr::tbl(con, table) + table_cols <- dplyr::tbl_vars(tbl_db) + values_cols <- colnames(values) + intersect(values_cols, table_cols) +} + +#' Build query to insert R data frame into SQL table +#' +#' @inheritParams insert_table +#' @param ... Additional arguments to [dbplyr::build_sql] +build_insert_query <- function(values, table, ...) { + value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ])) + + insert_list <- value_list %>% + purrr::map(unname) %>% + purrr::map(dbplyr::escape) %>% + purrr::map(dbplyr::sql_vector) + + dbplyr::build_sql( + dbplyr::sql("INSERT INTO"), + dbplyr::sql(" "), + dbplyr::ident(table), + dbplyr::sql(" "), + dbplyr::sql_vector(dbplyr::escape(colnames(values))), + dbplyr::sql(" "), + dbplyr::sql("VALUES"), + dbplyr::sql(" "), + dbplyr::sql_vector(insert_list, parens = FALSE, collapse = ", "), ... + ) +} diff --git a/base/db/man/build_insert_query.Rd b/base/db/man/build_insert_query.Rd new file mode 100644 index 00000000000..a11bdc46c84 --- /dev/null +++ b/base/db/man/build_insert_query.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{build_insert_query} +\alias{build_insert_query} +\title{Build query to insert R data frame into SQL table} +\usage{ +build_insert_query(values, table, ...) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{...}{Additional arguments to [dbplyr::build_sql]} +} +\description{ +Build query to insert R data frame into SQL table +} diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd new file mode 100644 index 00000000000..f63acf7cc70 --- /dev/null +++ b/base/db/man/insert_table.Rd @@ -0,0 +1,30 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{insert_table} +\alias{insert_table} +\title{Insert R data frame into SQL database} +\usage{ +insert_table(values, table, con) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} +} +\value{ +data frame with query results +} +\description{ +First, subset to matching columns. Then, build an SQL string for the insert +statement. Finally, insert into the database. +} +\examples{ +library(dplyr) +irisfile <- tempfile(fileext = ".sqlite") +irisdb <- src_sqlite(irisfile, create = TRUE) +copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) +insert_table(iris[-1,], "iris", irisdb) +tbl(irisdb, "iris") +} diff --git a/base/db/man/match_dbcols.Rd b/base/db/man/match_dbcols.Rd new file mode 100644 index 00000000000..d70b74717ae --- /dev/null +++ b/base/db/man/match_dbcols.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{match_dbcols} +\alias{match_dbcols} +\title{Match names of local data frame to SQL table} +\usage{ +match_dbcols(values, table, con) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} +} +\description{ +Match names of local data frame to SQL table +} From 1a0c15448a6e947812f517a3279647c64bbf79b9 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 13:01:07 -0600 Subject: [PATCH 06/18] DB: Add `db_merge_into` function. Also, bugfix `insert_table`. --- base/db/R/db_merge_into.R | 26 ++++++++++++++++++++++++++ base/db/R/insert_table.R | 4 ++-- base/db/man/db_merge_into.Rd | 29 +++++++++++++++++++++++++++++ base/db/man/insert_table.Rd | 2 +- 4 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 base/db/R/db_merge_into.R create mode 100644 base/db/man/db_merge_into.Rd diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R new file mode 100644 index 00000000000..64bb859b956 --- /dev/null +++ b/base/db/R/db_merge_into.R @@ -0,0 +1,26 @@ +#' Merge local data frame into SQL table +#' +#' @inheritParams insert_table +#' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values` +#' @examples +#' library(dplyr) +#' library(RSQLite) +#' irisfile <- tempfile(fileext = ".sqlite") +#' irisdb <- dbConnect(SQLite(), irisfile) +#' copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) +#' db_merge_into(iris[1:12,], "iris", irisdb) +#' tbl(irisdb, "iris") %>% count() +db_merge_into <- function(values, table, con, by = NULL) { + if (is.null(by)) { + by <- match_dbcols(values, table, con) + } + sql_tbl <- dplyr::tbl(con, table) + values_merge <- dplyr::anti_join(values, sql_tbl, by = by, copy = TRUE) + if (nrow(values_merge) < 1 || ncol(values_merge) < 1) { + PEcAn.logger::logger.warn( + "Input table for merge is empty. Skipping operation and returning NULL." + ) + return(NULL) + } + insert_table(values_merge, table, con) +} diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R index 0a8b23fdc5b..0d833526d23 100644 --- a/base/db/R/insert_table.R +++ b/base/db/R/insert_table.R @@ -13,7 +13,7 @@ #' irisfile <- tempfile(fileext = ".sqlite") #' irisdb <- src_sqlite(irisfile, create = TRUE) #' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) -#' insert_table(iris[-1,], "iris", irisdb) +#' insert_table(iris[-1,], "iris", irisdb$con) #' tbl(irisdb, "iris") insert_table <- function(values, table, con) { use_cols <- match_dbcols(values, table, con) @@ -28,7 +28,7 @@ insert_table <- function(values, table, con) { ) values_sub <- values[, use_cols] insert_query <- build_insert_query(values, table, con = con) - db.query(con, insert_query) + db.query(insert_query, con) } #' Match names of local data frame to SQL table diff --git a/base/db/man/db_merge_into.Rd b/base/db/man/db_merge_into.Rd new file mode 100644 index 00000000000..ebd4c201ec6 --- /dev/null +++ b/base/db/man/db_merge_into.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/db_merge_into.R +\name{db_merge_into} +\alias{db_merge_into} +\title{Merge local data frame into SQL table} +\usage{ +db_merge_into(values, table, con, by = NULL) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} + +\item{by}{Character vector of columns by which to perform merge. Defaults to all columns in `values`} +} +\description{ +Merge local data frame into SQL table +} +\examples{ +library(dplyr) +library(RSQLite) +irisfile <- tempfile(fileext = ".sqlite") +irisdb <- dbConnect(SQLite(), irisfile) +copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) +db_merge_into(iris[1:12,], "iris", irisdb) +tbl(irisdb, "iris") \%>\% count() +} diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd index f63acf7cc70..01e392b842a 100644 --- a/base/db/man/insert_table.Rd +++ b/base/db/man/insert_table.Rd @@ -25,6 +25,6 @@ library(dplyr) irisfile <- tempfile(fileext = ".sqlite") irisdb <- src_sqlite(irisfile, create = TRUE) copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) -insert_table(iris[-1,], "iris", irisdb) +insert_table(iris[-1,], "iris", irisdb$con) tbl(irisdb, "iris") } From cafbdda47addadc885d309041d7609afcf55152e Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 14:29:02 -0600 Subject: [PATCH 07/18] DB: Minor additions to citations script --- base/db/inst/import-try/02_citations.R | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R index 5039c078c0b..b831c40ddda 100644 --- a/base/db/inst/import-try/02_citations.R +++ b/base/db/inst/import-try/02_citations.R @@ -4,6 +4,9 @@ library(DBI) library(RSQLite) library(rcrossref) library(PEcAn.logger) +library(PEcAn.DB) + +logger.setLevel("DEBUG") sqlite_file <- "try.sqlite" @@ -13,7 +16,8 @@ references <- trydb %>% distinct(Reference) %>% pull(Reference) -references <- readLines("references")[1:15] +try_refs <- search_references(references, min_score = 1) -logger.setLevel("DEBUG") -tidy_refs <- search_references(references) +source("inst/import-try/bety_connect.R") + +# TODO: Use object `bety`, table `citations` From d339e1a269bb54d0e9a4852420b3fc62c2118e89 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 15:07:22 -0600 Subject: [PATCH 08/18] DB: Make db_merge_into output table This is to catch the automatic addition of IDs and other timestamps, for instance by Postgres. Also, add insert function unit tests. --- base/db/NAMESPACE | 1 + base/db/R/db_merge_into.R | 6 ++++- base/db/man/db_merge_into.Rd | 3 +++ base/db/tests/testthat/test.insert.R | 40 ++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 base/db/tests/testthat/test.insert.R diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index 2724becb5e9..a10100f4152 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -12,6 +12,7 @@ export(db.print.connections) export(db.query) export(db.showQueries) export(dbHostInfo) +export(db_merge_into) export(dbfile.check) export(dbfile.file) export(dbfile.id) diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R index 64bb859b956..8f9d7dec52f 100644 --- a/base/db/R/db_merge_into.R +++ b/base/db/R/db_merge_into.R @@ -2,6 +2,8 @@ #' #' @inheritParams insert_table #' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values` +#' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table) +#' @export #' @examples #' library(dplyr) #' library(RSQLite) @@ -22,5 +24,7 @@ db_merge_into <- function(values, table, con, by = NULL) { ) return(NULL) } - insert_table(values_merge, table, con) + insert <- insert_table(values_merge, table, con) + dplyr::tbl(con, table) %>% + dplyr::inner_join(values, copy = TRUE) } diff --git a/base/db/man/db_merge_into.Rd b/base/db/man/db_merge_into.Rd index ebd4c201ec6..764dc5a92ad 100644 --- a/base/db/man/db_merge_into.Rd +++ b/base/db/man/db_merge_into.Rd @@ -15,6 +15,9 @@ db_merge_into(values, table, con, by = NULL) \item{by}{Character vector of columns by which to perform merge. Defaults to all columns in `values`} } +\value{ +Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table) +} \description{ Merge local data frame into SQL table } diff --git a/base/db/tests/testthat/test.insert.R b/base/db/tests/testthat/test.insert.R new file mode 100644 index 00000000000..76509ccb282 --- /dev/null +++ b/base/db/tests/testthat/test.insert.R @@ -0,0 +1,40 @@ +library(PEcAn.DB) +library(DBI) +library(testthat) +library(dplyr) +context("SQL insertion helper functions") + +if (requireNamespace("RSQLite")) { + library(RSQLite) + iris_file <- tempfile(fileext = ".sqlite") + irisdb <- dbConnect(SQLite(), iris_file) + iris <- as_tibble(iris) %>% + mutate(Species = as.character(Species)) + + copy_to(irisdb, iris[1,], "iris", overwrite = TRUE) + iris_insert <- iris[2:10,] + .insert <- insert_table(iris_insert, "iris", irisdb) + iris_insert_test <- tbl(irisdb, "iris") %>% collect() + test_that( + "Subset of iris was inserted into database", + { + expect_equal(iris[1:10,], iris_insert_test) + } + ) + + iris_merge <- iris[5:12,] + out_merge <- db_merge_into(iris_merge, "iris", irisdb) + iris_merge_nrow <- tbl(irisdb, "iris") %>% + count() %>% + pull(n) + test_that( + "Only subset of iris data were merged", + { + expect_equal(collect(out_merge), iris[5:12,]) + } + ) + +} else { + message("Skipping insert tests because `RSQLite` not installed.") +} + From 72af94edaf7435da9db25f6d73f231d9ac9540ef Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 15:18:34 -0600 Subject: [PATCH 09/18] DB: Bugfix `insert_table` Only use columns that match the SQL database. --- base/db/R/db_merge_into.R | 6 +++--- base/db/R/insert_table.R | 2 +- base/db/tests/testthat/test.insert.R | 19 ++++++++++++++++--- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R index 8f9d7dec52f..a387769cf96 100644 --- a/base/db/R/db_merge_into.R +++ b/base/db/R/db_merge_into.R @@ -20,11 +20,11 @@ db_merge_into <- function(values, table, con, by = NULL) { values_merge <- dplyr::anti_join(values, sql_tbl, by = by, copy = TRUE) if (nrow(values_merge) < 1 || ncol(values_merge) < 1) { PEcAn.logger::logger.warn( - "Input table for merge is empty. Skipping operation and returning NULL." + "Input table for merge is empty." ) - return(NULL) + } else { + insert <- insert_table(values_merge, table, con) } - insert <- insert_table(values_merge, table, con) dplyr::tbl(con, table) %>% dplyr::inner_join(values, copy = TRUE) } diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R index 0d833526d23..ca0f9593304 100644 --- a/base/db/R/insert_table.R +++ b/base/db/R/insert_table.R @@ -27,7 +27,7 @@ insert_table <- function(values, table, con) { paste(use_cols, collapse = ", ") ) values_sub <- values[, use_cols] - insert_query <- build_insert_query(values, table, con = con) + insert_query <- build_insert_query(values_sub, table, con = con) db.query(insert_query, con) } diff --git a/base/db/tests/testthat/test.insert.R b/base/db/tests/testthat/test.insert.R index 76509ccb282..6df2179c156 100644 --- a/base/db/tests/testthat/test.insert.R +++ b/base/db/tests/testthat/test.insert.R @@ -9,10 +9,14 @@ if (requireNamespace("RSQLite")) { iris_file <- tempfile(fileext = ".sqlite") irisdb <- dbConnect(SQLite(), iris_file) iris <- as_tibble(iris) %>% - mutate(Species = as.character(Species)) + mutate( + Species = as.character(Species) + ) copy_to(irisdb, iris[1,], "iris", overwrite = TRUE) - iris_insert <- iris[2:10,] + # Add extra column to see if it's successfully ignored + iris2 <- mutate(iris, extracol = row_number()) + iris_insert <- iris2[2:10,] .insert <- insert_table(iris_insert, "iris", irisdb) iris_insert_test <- tbl(irisdb, "iris") %>% collect() test_that( @@ -22,8 +26,9 @@ if (requireNamespace("RSQLite")) { } ) - iris_merge <- iris[5:12,] + iris_merge <- iris2[5:12,] out_merge <- db_merge_into(iris_merge, "iris", irisdb) + out_merge2 <- db_merge_into(iris_merge, "iris", irisdb) iris_merge_nrow <- tbl(irisdb, "iris") %>% count() %>% pull(n) @@ -31,6 +36,14 @@ if (requireNamespace("RSQLite")) { "Only subset of iris data were merged", { expect_equal(collect(out_merge), iris[5:12,]) + expect_equal(collect(out_merge), collect(out_merge2)) + } + ) + + test_that( + "Extra column (not in SQL) was retained in `out_merge`", + { + expect_true("extracol" %in% colnames(out_merge)) } ) From fde7c43d69ff25eafa80d26750289b230217dfd3 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 15:23:03 -0600 Subject: [PATCH 10/18] DB: Add referenceID to try.sqlite --- base/db/R/try2sqlite.R | 11 +++-------- base/db/inst/import-try/01_try_sqlite.R | 8 ++++---- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R index ffbcb9fe0bc..595a823b4d1 100644 --- a/base/db/R/try2sqlite.R +++ b/base/db/R/try2sqlite.R @@ -20,11 +20,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { data.table::rbindlist() # Create integer reference ID for compact storage - data.table::set( - raw_data, - j = "ReferenceID", - value = as.integer(factor(raw_data[["Reference"]])) - ) + raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]])) # Create tables data_cols <- c( @@ -51,11 +47,10 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { "Dataset", "LastName", "FirstName", - "Reference" + "Reference", + "ReferenceID" ) datasets_values <- unique(raw_data[, datasets_cols, with = FALSE]) - # Add DOI column, to be filled later - datasets_values[["doi"]] <- NA_character_ traits_cols <- c( "DataID", diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R index 6dcd932b343..450f1cbbde1 100644 --- a/base/db/inst/import-try/01_try_sqlite.R +++ b/base/db/inst/import-try/01_try_sqlite.R @@ -1,5 +1,6 @@ # Create the TRY SQLite database library(PEcAn.DB) +library(PEcAn.logger) # Create the TRY SQLite database try_files <- "~/try-data/1829.txt" @@ -10,9 +11,8 @@ if (!exists("overwrite")) { } if (!file.exists(sqlite_file) || overwrite) { + file.remove(sqlite_file) try2sqlite(try_files, sqlite_file) -} - -if (!exists("force_add_doi")) { - force_add_doi <- FALSE +} else { + logger.info("TRY SQLite database already exists and `overwrite` is FALSE. ") } From 998d5b32e13704e43e8fa1d18aef4b0e9d24b577 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 1 Feb 2018 15:50:11 -0600 Subject: [PATCH 11/18] DB: Check column classes in `insert_table` --- base/db/R/insert_table.R | 39 +++++++++++++++++++++++++++++++++---- base/db/man/insert_table.Rd | 11 ++++++++--- 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R index ca0f9593304..b21b799f904 100644 --- a/base/db/R/insert_table.R +++ b/base/db/R/insert_table.R @@ -1,10 +1,14 @@ #' Insert R data frame into SQL database #' -#' First, subset to matching columns. Then, build an SQL string for the insert -#' statement. Finally, insert into the database. +#' First, subset to matching columns. Then, make sure the local and SQL column +#' classes match, coercing local to SQL as necessary (or throwing an error). +#' Then, build an SQL string for the insert statement. Finally, insert into the +#' database. #' #' @param values `data.frame` of values to write to SQL database #' @param table Name of target SQL table, as character +#' @param coerce_col_class logical, whether or not to coerce local data columns +#' to SQL classes. Default = `TRUE.` #' @inheritParams db.query #' @inherit db.query return #' @export @@ -15,7 +19,7 @@ #' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) #' insert_table(iris[-1,], "iris", irisdb$con) #' tbl(irisdb, "iris") -insert_table <- function(values, table, con) { +insert_table <- function(values, table, con, coerce_col_class = TRUE) { use_cols <- match_dbcols(values, table, con) if (length(use_cols) < 1) { PEcAn.logger::logger.severe( @@ -27,7 +31,34 @@ insert_table <- function(values, table, con) { paste(use_cols, collapse = ", ") ) values_sub <- values[, use_cols] - insert_query <- build_insert_query(values_sub, table, con = con) + # Load one row to get column types + sql_row <- dplyr::tbl(con, table) %>% head(1) %>% collect() + sql_types <- purrr::map(sql_row, class) %>% + purrr::map_chr(1) %>% + .[use_cols] + values_types <- purrr::map(values_sub, class) %>% purrr::map_chr(1) + type_mismatch <- sql_types != values_types + if (sum(type_mismatch) > 0) { + mismatch_string <- sprintf( + "%s: local is %s, SQL is %s", + names(values_types), + values_types, + sql_types + )[type_mismatch] + PEcAn.logger::logger.info( + "Found type mismatches in the following columns: ", + paste0(mismatch_string, collapse = "; ") + ) + if (!coerce_col_class) { + PEcAn.logger::logger.severe( + "Type mismatch detected, and `coerce_col_class` is `FALSE`. ", + "Fix column class mismatches manually." + ) + } + } + # Coerce values data frame to these types + values_fixed <- purrr::map2_dfc(values_sub, sql_types, as) + insert_query <- build_insert_query(values_fixed, table, con = con) db.query(insert_query, con) } diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd index 01e392b842a..54c7f3d7320 100644 --- a/base/db/man/insert_table.Rd +++ b/base/db/man/insert_table.Rd @@ -4,7 +4,7 @@ \alias{insert_table} \title{Insert R data frame into SQL database} \usage{ -insert_table(values, table, con) +insert_table(values, table, con, coerce_col_class = TRUE) } \arguments{ \item{values}{`data.frame` of values to write to SQL database} @@ -12,13 +12,18 @@ insert_table(values, table, con) \item{table}{Name of target SQL table, as character} \item{con}{database connection object} + +\item{coerce_col_class}{logical, whether or not to coerce local data columns +to SQL classes. Default = `TRUE.`} } \value{ data frame with query results } \description{ -First, subset to matching columns. Then, build an SQL string for the insert -statement. Finally, insert into the database. +First, subset to matching columns. Then, make sure the local and SQL column +classes match, coercing local to SQL as necessary (or throwing an error). +Then, build an SQL string for the insert statement. Finally, insert into the +database. } \examples{ library(dplyr) From 1e9515ba588eb49770101ccf9f09d0d8e853ca64 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Fri, 2 Feb 2018 08:57:39 -0600 Subject: [PATCH 12/18] DB: Miscellaneous bugfixes to last few commits --- base/db/NAMESPACE | 1 + base/db/R/db_merge_into.R | 7 +++--- base/db/R/insert_table.R | 31 ++++++++++++++++++++------ base/db/inst/import-try/02_citations.R | 13 ++++++----- base/db/man/match_colnames.Rd | 18 +++++++++++++++ base/db/man/match_dbcols.Rd | 12 +++++++--- 6 files changed, 64 insertions(+), 18 deletions(-) create mode 100644 base/db/man/match_colnames.Rd diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index a10100f4152..67a63071aa8 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -35,6 +35,7 @@ export(get_var_names) export(get_workflow_ids) export(insert_table) export(load_data_single_run) +export(match_dbcols) export(ncdays2date) export(query.file.path) export(query.format.vars) diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R index a387769cf96..eac93e184ab 100644 --- a/base/db/R/db_merge_into.R +++ b/base/db/R/db_merge_into.R @@ -13,11 +13,12 @@ #' db_merge_into(iris[1:12,], "iris", irisdb) #' tbl(irisdb, "iris") %>% count() db_merge_into <- function(values, table, con, by = NULL) { + values_fixed <- match_dbcols(values, table, con) if (is.null(by)) { - by <- match_dbcols(values, table, con) + by <- colnames(values_fixed) } sql_tbl <- dplyr::tbl(con, table) - values_merge <- dplyr::anti_join(values, sql_tbl, by = by, copy = TRUE) + values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE) if (nrow(values_merge) < 1 || ncol(values_merge) < 1) { PEcAn.logger::logger.warn( "Input table for merge is empty." @@ -26,5 +27,5 @@ db_merge_into <- function(values, table, con, by = NULL) { insert <- insert_table(values_merge, table, con) } dplyr::tbl(con, table) %>% - dplyr::inner_join(values, copy = TRUE) + dplyr::inner_join(values_fixed, copy = TRUE) } diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R index b21b799f904..8d66449aaec 100644 --- a/base/db/R/insert_table.R +++ b/base/db/R/insert_table.R @@ -20,7 +20,19 @@ #' insert_table(iris[-1,], "iris", irisdb$con) #' tbl(irisdb, "iris") insert_table <- function(values, table, con, coerce_col_class = TRUE) { - use_cols <- match_dbcols(values, table, con) + values_fixed <- match_dbcols(values, table, con, coerce_col_class) + insert_query <- build_insert_query(values_fixed, table, con = con) + print(insert_query) + db.query(insert_query, con) +} + +#' Match column names and classes between local and SQL table +#' +#' @inheritParams insert_table +#' @return `values` `data.frame` with column names and classes matched to SQL +#' @export +match_dbcols <- function(values, table, con, coerce_col_class = TRUE) { + use_cols <- match_colnames(values, table, con) if (length(use_cols) < 1) { PEcAn.logger::logger.severe( "No columns match between input and target table." @@ -54,18 +66,23 @@ insert_table <- function(values, table, con, coerce_col_class = TRUE) { "Type mismatch detected, and `coerce_col_class` is `FALSE`. ", "Fix column class mismatches manually." ) + } else { + PEcAn.logger::logger.info( + "Coercing local column types to match SQL." + ) + # Coerce values data frame to these types + values_fixed <- purrr::map2_dfc(values_sub, sql_types, as) } + } else { + values_fixed <- values_sub } - # Coerce values data frame to these types - values_fixed <- purrr::map2_dfc(values_sub, sql_types, as) - insert_query <- build_insert_query(values_fixed, table, con = con) - db.query(insert_query, con) + values_fixed } #' Match names of local data frame to SQL table #' #' @inheritParams insert_table -match_dbcols <- function(values, table, con) { +match_colnames <- function(values, table, con) { tbl_db <- dplyr::tbl(con, table) table_cols <- dplyr::tbl_vars(tbl_db) values_cols <- colnames(values) @@ -89,7 +106,7 @@ build_insert_query <- function(values, table, ...) { dbplyr::sql(" "), dbplyr::ident(table), dbplyr::sql(" "), - dbplyr::sql_vector(dbplyr::escape(colnames(values))), + dbplyr::sql_vector(ident(colnames(values)), collapse = ", "), dbplyr::sql(" "), dbplyr::sql("VALUES"), dbplyr::sql(" "), diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R index b831c40ddda..beb3e7c6b0b 100644 --- a/base/db/inst/import-try/02_citations.R +++ b/base/db/inst/import-try/02_citations.R @@ -11,13 +11,16 @@ logger.setLevel("DEBUG") sqlite_file <- "try.sqlite" trydb <- dbConnect(SQLite(), sqlite_file) -references <- trydb %>% - tbl("datasets") %>% - distinct(Reference) %>% - pull(Reference) +reference_dat <- tbl(trydb, "datasets") %>% + distinct(Reference, ReferenceID) %>% + collect() -try_refs <- search_references(references, min_score = 1) +refs_proc <- reference_dat %>% + mutate(cr_df = map(Reference, search_references, min_score = 40)) %>% + unnest() source("inst/import-try/bety_connect.R") +bety_refs <- db_merge_into(refs_proc, "citations", bety, "doi") %>% + collect() # TODO: Use object `bety`, table `citations` diff --git a/base/db/man/match_colnames.Rd b/base/db/man/match_colnames.Rd new file mode 100644 index 00000000000..38450a89bfa --- /dev/null +++ b/base/db/man/match_colnames.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/insert_table.R +\name{match_colnames} +\alias{match_colnames} +\title{Match names of local data frame to SQL table} +\usage{ +match_colnames(values, table, con) +} +\arguments{ +\item{values}{`data.frame` of values to write to SQL database} + +\item{table}{Name of target SQL table, as character} + +\item{con}{database connection object} +} +\description{ +Match names of local data frame to SQL table +} diff --git a/base/db/man/match_dbcols.Rd b/base/db/man/match_dbcols.Rd index d70b74717ae..490e5f6b58a 100644 --- a/base/db/man/match_dbcols.Rd +++ b/base/db/man/match_dbcols.Rd @@ -2,9 +2,9 @@ % Please edit documentation in R/insert_table.R \name{match_dbcols} \alias{match_dbcols} -\title{Match names of local data frame to SQL table} +\title{Match column names and classes between local and SQL table} \usage{ -match_dbcols(values, table, con) +match_dbcols(values, table, con, coerce_col_class = TRUE) } \arguments{ \item{values}{`data.frame` of values to write to SQL database} @@ -12,7 +12,13 @@ match_dbcols(values, table, con) \item{table}{Name of target SQL table, as character} \item{con}{database connection object} + +\item{coerce_col_class}{logical, whether or not to coerce local data columns +to SQL classes. Default = `TRUE.`} +} +\value{ +`values` `data.frame` with column names and classes matched to SQL } \description{ -Match names of local data frame to SQL table +Match column names and classes between local and SQL table } From 0f3b9c75b8475195129f240c23ed660b150386ac Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Fri, 2 Feb 2018 10:55:37 -0500 Subject: [PATCH 13/18] DB: Add more logging to try2sqlite When the data are huge, it's nice to get more regular status updates. --- base/db/R/try2sqlite.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R index 595a823b4d1..a95f96aa08c 100644 --- a/base/db/R/try2sqlite.R +++ b/base/db/R/try2sqlite.R @@ -16,13 +16,16 @@ #' @export try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { # Read files + PEcAn.logger::logger.info("Reading in TRY data...") raw_data <- Map(data.table::fread, try_files) %>% data.table::rbindlist() # Create integer reference ID for compact storage + PEcAn.logger::logger.info("Adding ReferenceID column") raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]])) # Create tables + PEcAn.logger::logger.info("Extracting data values table.") data_cols <- c( "ObsDataID", # TRY row ID -- unique to each observation of a given trait "ObservationID", # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf) @@ -42,6 +45,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { ) data_values <- unique(raw_data[, data_cols, with = FALSE]) + PEcAn.logger::logger.info("Extrating datasets table...") datasets_cols <- c( "DatasetID", "Dataset", @@ -52,6 +56,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { ) datasets_values <- unique(raw_data[, datasets_cols, with = FALSE]) + PEcAn.logger::logger.info("Extracting traits table...") traits_cols <- c( "DataID", "DataName", @@ -60,6 +65,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { ) traits_values <- unique(raw_data[, traits_cols, with = FALSE]) + PEcAn.logger::logger.info("Extracting species table...") species_cols <- c( "AccSpeciesID", "AccSpeciesName", @@ -67,12 +73,19 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { ) species_values <- unique(raw_data[, species_cols, with = FALSE]) + PEcAn.logger::logger.info("Writing tables to SQLite database...") con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file) on.exit(DBI::dbDisconnect(con)) + PEcAn.logger::logger.info("Writing values table...") DBI::dbWriteTable(con, "values", data_values) + PEcAn.logger::logger.info("Writing traits table...") DBI::dbWriteTable(con, "traits", data_values) + PEcAn.logger::logger.info("Writing datasets table...") DBI::dbWriteTable(con, "datasets", datasets_values) + PEcAn.logger::logger.info("Writing species table...") DBI::dbWriteTable(con, "species", species_values) + PEcAn.logger::logger.info("Done creating TRY SQLite database!") + NULL } From 90ac2ba0ff0beb8724d5963fabf6b6b7f097da5c Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Fri, 2 Feb 2018 10:57:52 -0500 Subject: [PATCH 14/18] DB: Improve directory management in TRY workflow Use the `here` package to always work relative to `db` package directory. Also, set paths and connections in a `config.R` script. --- base/db/.here | 0 base/db/inst/import-try/01_try_sqlite.R | 6 +++--- base/db/inst/import-try/config.R | 14 ++++++++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) create mode 100644 base/db/.here create mode 100644 base/db/inst/import-try/config.R diff --git a/base/db/.here b/base/db/.here new file mode 100644 index 00000000000..e69de29bb2d diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R index 450f1cbbde1..9480ea8687e 100644 --- a/base/db/inst/import-try/01_try_sqlite.R +++ b/base/db/inst/import-try/01_try_sqlite.R @@ -1,10 +1,10 @@ # Create the TRY SQLite database library(PEcAn.DB) library(PEcAn.logger) +library(here) -# Create the TRY SQLite database -try_files <- "~/try-data/1829.txt" -sqlite_file <- "try.sqlite" +configfile <- here("inst", "import-try", "config.R") +source(configfile) if (!exists("overwrite")) { overwrite <- FALSE diff --git a/base/db/inst/import-try/config.R b/base/db/inst/import-try/config.R new file mode 100644 index 00000000000..eff2c8ce7a1 --- /dev/null +++ b/base/db/inst/import-try/config.R @@ -0,0 +1,14 @@ +# Character vector containing TRY file paths +try_files <- c("~/Projects/try-data/4143.txt") +stopifnot(all(file.exists(try_files))) + +# Path to generated TRY SQLite file +sqlite_file <- "inst/import-try/try.sqlite" + +# Bety connection configuration +betyparams <- list( + host = "localhost", + dbname = "bety", + user = "bety", + password = "bety" +) From 951012399032ba4f2e31e33f8192223f3b52e40d Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Fri, 2 Feb 2018 12:26:23 -0600 Subject: [PATCH 15/18] DB: Clean up TRY citations script --- base/db/DESCRIPTION | 3 +- base/db/inst/import-try/02_citations.R | 52 +++++++++++++++++++++----- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index 8dabca46856..654a80817ee 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -29,7 +29,8 @@ Suggests: testthat (>= 1.0.2), tidyverse, data.table, - rcrossref + rcrossref, + here License: FreeBSD + file LICENSE Copyright: Authors LazyLoad: yes diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R index beb3e7c6b0b..453d65dd007 100644 --- a/base/db/inst/import-try/02_citations.R +++ b/base/db/inst/import-try/02_citations.R @@ -5,22 +5,56 @@ library(RSQLite) library(rcrossref) library(PEcAn.logger) library(PEcAn.DB) +library(here) -logger.setLevel("DEBUG") +wd <- here("inst", "import-try") +configfile <- file.path(wd, "config.R") +source(configfile) + +data_dir <- file.path(wd, "data-proc") +dir.create(data_dir, showWarnings = FALSE) -sqlite_file <- "try.sqlite" +logger.setLevel("DEBUG") trydb <- dbConnect(SQLite(), sqlite_file) reference_dat <- tbl(trydb, "datasets") %>% distinct(Reference, ReferenceID) %>% collect() -refs_proc <- reference_dat %>% - mutate(cr_df = map(Reference, search_references, min_score = 40)) %>% - unnest() +refs_proc_file <- file.path(data_dir, "refs_proc.rds") +if (file.exists(refs_proc_file)) { + refs_proc <- readRDS(refs_proc_file) +} else { + refs_proc <- reference_dat %>% + mutate(cr_df = map(Reference, search_references, min_score = 40)) %>% + unnest() + saveRDS(refs_proc, refs_proc_file) +} -source("inst/import-try/bety_connect.R") -bety_refs <- db_merge_into(refs_proc, "citations", bety, "doi") %>% - collect() +# Replace bad matches with NA +minscore <- 85 +fill_na <- function(field, score) { + na <- as(NA, class(field)) + if_else(score > minscore, field, na) +} +refs_proc2 <- refs_proc %>% + mutate_at( + c("author", "year", "journal", "vol", "pg", "doi"), + fill_na, + score = .$score + ) %>% + mutate( + title = if_else(score > minscore, title, Reference) + ) %>% + # TODO: Hack until Bety author column is extended + mutate_if(is.character, substr, start = 0, stop = 254) %>% + # TODO: Hack until Bety constraints on these columns are relaxed + select(-pg) -# TODO: Use object `bety`, table `citations` + +# Check character column length + +bety <- db.open(betyparams) + +bety_refs <- db_merge_into(refs_proc2, "citations", bety) %>% + collect() From 717c47711a6be9e5282451e22aeb45a9640418e4 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Wed, 7 Feb 2018 15:58:48 -0500 Subject: [PATCH 16/18] DB: Working TRY citations import --- base/db/.gitignore | 1 + base/db/R/search_references.R | 9 +++--- base/db/R/zz.imports.R | 7 +++-- base/db/inst/import-try/02_citations.R | 39 +++++++++++++++++--------- 4 files changed, 35 insertions(+), 21 deletions(-) diff --git a/base/db/.gitignore b/base/db/.gitignore index bcf2011eddd..c1e2f2a3c7b 100644 --- a/base/db/.gitignore +++ b/base/db/.gitignore @@ -1 +1,2 @@ try.sqlite +inst/import-try/data-proc diff --git a/base/db/R/search_references.R b/base/db/R/search_references.R index 86f5e4150f2..14b61adc7bb 100644 --- a/base/db/R/search_references.R +++ b/base/db/R/search_references.R @@ -53,14 +53,15 @@ search_reference_single <- function(query, limit = 1, min_score = 85) { ) proc_search <- crdata %>% dplyr::mutate( - author = purrr::map_chr( - author, - ~paste(.[["given"]], .[["family"]], sep = " ", collapse = ", ")), + # Get the first author only -- this is the BETY format + author_family = purrr::map(author, list("family", 1)), + author_given = purrr::map(author, list("given", 1)), + author = paste(author_family, author_given, sep = ", "), year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(), query = query, score = as.numeric(score) ) use_cols <- keep_cols[keep_cols %in% colnames(proc_search)] - select(proc_search, !!!use_cols) + dplyr::select(proc_search, !!!use_cols) } diff --git a/base/db/R/zz.imports.R b/base/db/R/zz.imports.R index 050b5a37cf9..fbdf9b818aa 100644 --- a/base/db/R/zz.imports.R +++ b/base/db/R/zz.imports.R @@ -1,3 +1,4 @@ -##' Imports from other packages -##' -##' @importFrom magrittr `%>%` +#' Imports from other packages +#' +#' @importFrom magrittr `%>%` +#' @importFrom rlang `!!` `!!!` diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R index 453d65dd007..98c1dc56758 100644 --- a/base/db/inst/import-try/02_citations.R +++ b/base/db/inst/import-try/02_citations.R @@ -14,9 +14,18 @@ source(configfile) data_dir <- file.path(wd, "data-proc") dir.create(data_dir, showWarnings = FALSE) -logger.setLevel("DEBUG") +bety <- db.open(betyparams) +if (!"notes" %in% dbListFields(bety, "citations")) { + logger.severe( + "`notes` column required in Bety citations table ", + "but not found in this version of Bety. ", + "Please make sure you have performed schema migration ", + "version 20180206152600 (relax_citations)." + ) +} trydb <- dbConnect(SQLite(), sqlite_file) + reference_dat <- tbl(trydb, "datasets") %>% distinct(Reference, ReferenceID) %>% collect() @@ -25,9 +34,11 @@ refs_proc_file <- file.path(data_dir, "refs_proc.rds") if (file.exists(refs_proc_file)) { refs_proc <- readRDS(refs_proc_file) } else { + logger.setLevel("DEBUG") # To get status messages refs_proc <- reference_dat %>% mutate(cr_df = map(Reference, search_references, min_score = 40)) %>% unnest() + logger.setLevel("INFO") saveRDS(refs_proc, refs_proc_file) } @@ -39,22 +50,22 @@ fill_na <- function(field, score) { } refs_proc2 <- refs_proc %>% mutate_at( - c("author", "year", "journal", "vol", "pg", "doi"), + c("title", "author", "year", "journal", "vol", "pg", "doi"), fill_na, score = .$score ) %>% mutate( - title = if_else(score > minscore, title, Reference) - ) %>% - # TODO: Hack until Bety author column is extended - mutate_if(is.character, substr, start = 0, stop = 254) %>% - # TODO: Hack until Bety constraints on these columns are relaxed - select(-pg) - - -# Check character column length - -bety <- db.open(betyparams) + title = if_else(!is.na(title), title, paste0("TRY ReferenceID ", ReferenceID)), + author = if_else(!is.na(author), author, "Unknown TRY data (see title)"), + author = substr(author, 0, 254), # Trim author to 255 characters + journal = if_else(!is.na(journal), journal, "Unknown TRY data (see title)"), + # Use the Kattge 2007 TRY paper's DOI as a placeholder + doi = if_else(!is.na(doi), doi, "10.1111/j.1365-2486.2011.02451.x"), + year = if_else(!is.na(year), year, 2018), + pg = if_else(!is.na(pg), pg, "9999"), + notes = paste("Original TRY reference: ", Reference) + ) -bety_refs <- db_merge_into(refs_proc2, "citations", bety) %>% +bety_refs <- db_merge_into(refs_proc2, "citations", bety, "notes") %>% collect() +saveRDS(bety_refs, file.path(data_dir, "refs_bety.rds")) From 2e889034d5b930e2e199bcf892f82529979466c7 Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Wed, 7 Feb 2018 16:22:19 -0500 Subject: [PATCH 17/18] DB: Fix try2sqlite bug --- base/db/R/try2sqlite.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R index a95f96aa08c..828afd00957 100644 --- a/base/db/R/try2sqlite.R +++ b/base/db/R/try2sqlite.R @@ -79,7 +79,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") { PEcAn.logger::logger.info("Writing values table...") DBI::dbWriteTable(con, "values", data_values) PEcAn.logger::logger.info("Writing traits table...") - DBI::dbWriteTable(con, "traits", data_values) + DBI::dbWriteTable(con, "traits", traits_values) PEcAn.logger::logger.info("Writing datasets table...") DBI::dbWriteTable(con, "datasets", datasets_values) PEcAn.logger::logger.info("Writing species table...") From 433c96e46a73a32965296f81108595fdc079d4bc Mon Sep 17 00:00:00 2001 From: Alexey Shiklomanov Date: Thu, 8 Feb 2018 12:04:06 -0500 Subject: [PATCH 18/18] DB: Address @infotroph PR #1848 comments Notable changes: - Use `glue::glue_sql` instead of `dbplyr::build_sql` for generating SQL query. The syntax is much cleaner. This also fixes the breaking tests by improving the way column names are quoted. - Remove the `.here` file - In tests and examples, use `":memory:"` SQLite database instead of a temporary file - Use explicit `package::function` syntax in tests, to test for NAMESPACE issues --- base/db/.here | 0 base/db/DESCRIPTION | 4 ++ base/db/NAMESPACE | 1 + base/db/R/db_merge_into.R | 18 +++--- base/db/R/insert_table.R | 45 +++++++------- base/db/R/zz.imports.R | 9 ++- base/db/man/build_insert_query.Rd | 4 +- base/db/man/db_merge_into.Rd | 23 +++++--- base/db/man/insert_table.Rd | 12 ++-- base/db/man/match_dbcols.Rd | 4 +- base/db/man/otherimports.Rd | 19 ++++++ base/db/tests/testthat/test.insert.R | 88 +++++++++++++--------------- 12 files changed, 129 insertions(+), 98 deletions(-) delete mode 100644 base/db/.here create mode 100644 base/db/man/otherimports.Rd diff --git a/base/db/.here b/base/db/.here deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION index 654a80817ee..82366762c69 100644 --- a/base/db/DESCRIPTION +++ b/base/db/DESCRIPTION @@ -18,6 +18,10 @@ Imports: PEcAn.utils, dbplyr (>= 1.2.0), dplyr, + tibble, + purrr, + tidyr, + glue, lubridate, magrittr, ncdf4, diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE index 67a63071aa8..01bcbcd748e 100644 --- a/base/db/NAMESPACE +++ b/base/db/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export("%>%") export(append.covariate) export(assign.treatments) export(bety2pecan) diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R index eac93e184ab..a00718e5ac3 100644 --- a/base/db/R/db_merge_into.R +++ b/base/db/R/db_merge_into.R @@ -1,21 +1,19 @@ #' Merge local data frame into SQL table #' #' @inheritParams insert_table +#' @inheritDotParams insert_table #' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values` #' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table) #' @export #' @examples -#' library(dplyr) -#' library(RSQLite) -#' irisfile <- tempfile(fileext = ".sqlite") -#' irisdb <- dbConnect(SQLite(), irisfile) -#' copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) +#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +#' dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) #' db_merge_into(iris[1:12,], "iris", irisdb) -#' tbl(irisdb, "iris") %>% count() -db_merge_into <- function(values, table, con, by = NULL) { - values_fixed <- match_dbcols(values, table, con) +#' dplyr::tbl(irisdb, "iris") %>% dplyr::count() +db_merge_into <- function(values, table, con, by = NULL, drop = FALSE, ...) { + values_fixed <- match_dbcols(values, table, con, drop = FALSE) if (is.null(by)) { - by <- colnames(values_fixed) + by <- match_colnames(values, table, con) } sql_tbl <- dplyr::tbl(con, table) values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE) @@ -24,7 +22,7 @@ db_merge_into <- function(values, table, con, by = NULL) { "Input table for merge is empty." ) } else { - insert <- insert_table(values_merge, table, con) + insert <- insert_table(values_merge, table, con, ...) } dplyr::tbl(con, table) %>% dplyr::inner_join(values_fixed, copy = TRUE) diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R index 8d66449aaec..f9e45c488bf 100644 --- a/base/db/R/insert_table.R +++ b/base/db/R/insert_table.R @@ -9,20 +9,18 @@ #' @param table Name of target SQL table, as character #' @param coerce_col_class logical, whether or not to coerce local data columns #' to SQL classes. Default = `TRUE.` +#' @param drop logical. If `TRUE` (default), drop columns not found in SQL table. #' @inheritParams db.query #' @inherit db.query return #' @export #' @examples -#' library(dplyr) -#' irisfile <- tempfile(fileext = ".sqlite") -#' irisdb <- src_sqlite(irisfile, create = TRUE) -#' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) +#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +#' dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) #' insert_table(iris[-1,], "iris", irisdb$con) -#' tbl(irisdb, "iris") -insert_table <- function(values, table, con, coerce_col_class = TRUE) { - values_fixed <- match_dbcols(values, table, con, coerce_col_class) - insert_query <- build_insert_query(values_fixed, table, con = con) - print(insert_query) +#' dplyr::tbl(irisdb, "iris") +insert_table <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) { + values_fixed <- match_dbcols(values, table, con, coerce_col_class, drop = TRUE) + insert_query <- build_insert_query(values_fixed, table, .con = con) db.query(insert_query, con) } @@ -31,7 +29,7 @@ insert_table <- function(values, table, con, coerce_col_class = TRUE) { #' @inheritParams insert_table #' @return `values` `data.frame` with column names and classes matched to SQL #' @export -match_dbcols <- function(values, table, con, coerce_col_class = TRUE) { +match_dbcols <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) { use_cols <- match_colnames(values, table, con) if (length(use_cols) < 1) { PEcAn.logger::logger.severe( @@ -44,7 +42,7 @@ match_dbcols <- function(values, table, con, coerce_col_class = TRUE) { ) values_sub <- values[, use_cols] # Load one row to get column types - sql_row <- dplyr::tbl(con, table) %>% head(1) %>% collect() + sql_row <- dplyr::tbl(con, table) %>% head(1) %>% dplyr::collect() sql_types <- purrr::map(sql_row, class) %>% purrr::map_chr(1) %>% .[use_cols] @@ -76,7 +74,12 @@ match_dbcols <- function(values, table, con, coerce_col_class = TRUE) { } else { values_fixed <- values_sub } - values_fixed + if (drop) { + values_fixed + } else { + drop_cols <- colnames(values)[!colnames(values) %in% use_cols] + dplyr::bind_cols(values_fixed, values[, drop_cols]) + } } #' Match names of local data frame to SQL table @@ -92,8 +95,8 @@ match_colnames <- function(values, table, con) { #' Build query to insert R data frame into SQL table #' #' @inheritParams insert_table -#' @param ... Additional arguments to [dbplyr::build_sql] -build_insert_query <- function(values, table, ...) { +#' @inheritParams glue::glue_sql +build_insert_query <- function(values, table, .con) { value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ])) insert_list <- value_list %>% @@ -101,15 +104,9 @@ build_insert_query <- function(values, table, ...) { purrr::map(dbplyr::escape) %>% purrr::map(dbplyr::sql_vector) - dbplyr::build_sql( - dbplyr::sql("INSERT INTO"), - dbplyr::sql(" "), - dbplyr::ident(table), - dbplyr::sql(" "), - dbplyr::sql_vector(ident(colnames(values)), collapse = ", "), - dbplyr::sql(" "), - dbplyr::sql("VALUES"), - dbplyr::sql(" "), - dbplyr::sql_vector(insert_list, parens = FALSE, collapse = ", "), ... + glue::glue_sql( + "INSERT INTO {`table`} ({`colnames(values)`*}) ", + "VALUES {insert_list*}", + .con = .con ) } diff --git a/base/db/R/zz.imports.R b/base/db/R/zz.imports.R index fbdf9b818aa..4eb0423a534 100644 --- a/base/db/R/zz.imports.R +++ b/base/db/R/zz.imports.R @@ -1,4 +1,9 @@ #' Imports from other packages #' -#' @importFrom magrittr `%>%` -#' @importFrom rlang `!!` `!!!` +#' @name otherimports +#' @importFrom magrittr %>% +#' @export +magrittr::`%>%` + +#' @rdname otherimports +#' @importFrom rlang !! !!! diff --git a/base/db/man/build_insert_query.Rd b/base/db/man/build_insert_query.Rd index a11bdc46c84..bbfb23fcfd4 100644 --- a/base/db/man/build_insert_query.Rd +++ b/base/db/man/build_insert_query.Rd @@ -4,14 +4,14 @@ \alias{build_insert_query} \title{Build query to insert R data frame into SQL table} \usage{ -build_insert_query(values, table, ...) +build_insert_query(values, table, .con) } \arguments{ \item{values}{`data.frame` of values to write to SQL database} \item{table}{Name of target SQL table, as character} -\item{...}{Additional arguments to [dbplyr::build_sql]} +\item{.con}{[\code{DBIConnection}]:A DBI connection object obtained from \code{DBI::dbConnect()}.} } \description{ Build query to insert R data frame into SQL table diff --git a/base/db/man/db_merge_into.Rd b/base/db/man/db_merge_into.Rd index 764dc5a92ad..63752d4b027 100644 --- a/base/db/man/db_merge_into.Rd +++ b/base/db/man/db_merge_into.Rd @@ -4,7 +4,7 @@ \alias{db_merge_into} \title{Merge local data frame into SQL table} \usage{ -db_merge_into(values, table, con, by = NULL) +db_merge_into(values, table, con, by = NULL, drop = FALSE, ...) } \arguments{ \item{values}{`data.frame` of values to write to SQL database} @@ -14,6 +14,18 @@ db_merge_into(values, table, con, by = NULL) \item{con}{database connection object} \item{by}{Character vector of columns by which to perform merge. Defaults to all columns in `values`} + +\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} + +\item{...}{Arguments passed on to \code{insert_table} +\describe{ + \item{values}{`data.frame` of values to write to SQL database} + \item{table}{Name of target SQL table, as character} + \item{coerce_col_class}{logical, whether or not to coerce local data columns +to SQL classes. Default = `TRUE.`} + \item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} + \item{con}{database connection object} +}} } \value{ Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table) @@ -22,11 +34,8 @@ Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy q Merge local data frame into SQL table } \examples{ -library(dplyr) -library(RSQLite) -irisfile <- tempfile(fileext = ".sqlite") -irisdb <- dbConnect(SQLite(), irisfile) -copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) +irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE) db_merge_into(iris[1:12,], "iris", irisdb) -tbl(irisdb, "iris") \%>\% count() +dplyr::tbl(irisdb, "iris") \%>\% dplyr::count() } diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd index 54c7f3d7320..c51740868ce 100644 --- a/base/db/man/insert_table.Rd +++ b/base/db/man/insert_table.Rd @@ -4,7 +4,7 @@ \alias{insert_table} \title{Insert R data frame into SQL database} \usage{ -insert_table(values, table, con, coerce_col_class = TRUE) +insert_table(values, table, con, coerce_col_class = TRUE, drop = TRUE) } \arguments{ \item{values}{`data.frame` of values to write to SQL database} @@ -15,6 +15,8 @@ insert_table(values, table, con, coerce_col_class = TRUE) \item{coerce_col_class}{logical, whether or not to coerce local data columns to SQL classes. Default = `TRUE.`} + +\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} } \value{ data frame with query results @@ -26,10 +28,8 @@ Then, build an SQL string for the insert statement. Finally, insert into the database. } \examples{ -library(dplyr) -irisfile <- tempfile(fileext = ".sqlite") -irisdb <- src_sqlite(irisfile, create = TRUE) -copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) +irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") +dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE) insert_table(iris[-1,], "iris", irisdb$con) -tbl(irisdb, "iris") +dplyr::tbl(irisdb, "iris") } diff --git a/base/db/man/match_dbcols.Rd b/base/db/man/match_dbcols.Rd index 490e5f6b58a..ba0b6ae8d00 100644 --- a/base/db/man/match_dbcols.Rd +++ b/base/db/man/match_dbcols.Rd @@ -4,7 +4,7 @@ \alias{match_dbcols} \title{Match column names and classes between local and SQL table} \usage{ -match_dbcols(values, table, con, coerce_col_class = TRUE) +match_dbcols(values, table, con, coerce_col_class = TRUE, drop = TRUE) } \arguments{ \item{values}{`data.frame` of values to write to SQL database} @@ -15,6 +15,8 @@ match_dbcols(values, table, con, coerce_col_class = TRUE) \item{coerce_col_class}{logical, whether or not to coerce local data columns to SQL classes. Default = `TRUE.`} + +\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.} } \value{ `values` `data.frame` with column names and classes matched to SQL diff --git a/base/db/man/otherimports.Rd b/base/db/man/otherimports.Rd new file mode 100644 index 00000000000..dd79f7fbbc3 --- /dev/null +++ b/base/db/man/otherimports.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/zz.imports.R +\docType{import} +\name{otherimports} +\alias{otherimports} +\alias{\%>\%} +\title{Imports from other packages} +\description{ +Imports from other packages +} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{magrittr}{\code{\link[magrittr]{\%>\%}}} +}} + diff --git a/base/db/tests/testthat/test.insert.R b/base/db/tests/testthat/test.insert.R index 6df2179c156..95dd80a459e 100644 --- a/base/db/tests/testthat/test.insert.R +++ b/base/db/tests/testthat/test.insert.R @@ -1,53 +1,49 @@ library(PEcAn.DB) -library(DBI) library(testthat) -library(dplyr) context("SQL insertion helper functions") -if (requireNamespace("RSQLite")) { - library(RSQLite) - iris_file <- tempfile(fileext = ".sqlite") - irisdb <- dbConnect(SQLite(), iris_file) - iris <- as_tibble(iris) %>% - mutate( - Species = as.character(Species) +test_that( + "RSQLite-dependent tests work", + { + skip_if_not_installed("RSQLite") + irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:") + iris <- tibble::as_tibble(iris) %>% + dplyr::mutate( + Species = as.character(Species) + ) + dplyr::copy_to(irisdb, iris[1,], "iris", overwrite = TRUE) + # Add extra column to see if it's successfully ignored + iris2 <- dplyr::mutate(iris, extracol = seq_len(nrow(iris))) + iris_insert <- iris2[2:10,] + .insert <- insert_table(iris_insert, "iris", irisdb) + test_that( + "Subset of iris was inserted into database", + { + iris_insert_test <- dplyr::tbl(irisdb, "iris") %>% dplyr::collect() + expect_equal(iris[1:10,], iris_insert_test) + } ) - copy_to(irisdb, iris[1,], "iris", overwrite = TRUE) - # Add extra column to see if it's successfully ignored - iris2 <- mutate(iris, extracol = row_number()) - iris_insert <- iris2[2:10,] - .insert <- insert_table(iris_insert, "iris", irisdb) - iris_insert_test <- tbl(irisdb, "iris") %>% collect() - test_that( - "Subset of iris was inserted into database", - { - expect_equal(iris[1:10,], iris_insert_test) - } - ) - - iris_merge <- iris2[5:12,] - out_merge <- db_merge_into(iris_merge, "iris", irisdb) - out_merge2 <- db_merge_into(iris_merge, "iris", irisdb) - iris_merge_nrow <- tbl(irisdb, "iris") %>% - count() %>% - pull(n) - test_that( - "Only subset of iris data were merged", - { - expect_equal(collect(out_merge), iris[5:12,]) - expect_equal(collect(out_merge), collect(out_merge2)) - } - ) - - test_that( - "Extra column (not in SQL) was retained in `out_merge`", - { - expect_true("extracol" %in% colnames(out_merge)) - } - ) - -} else { - message("Skipping insert tests because `RSQLite` not installed.") -} + iris_merge <- iris2[5:12,] + out_merge <- db_merge_into(iris_merge, "iris", irisdb) %>% + dplyr::collect() + iris_merge_nrow <- dplyr::tbl(irisdb, "iris") %>% + dplyr::count() %>% + dplyr::pull(n) + test_that( + "Only subset of iris data were merged", + { + expect_equal(out_merge, iris2[5:12,]) + out_merge2 <- db_merge_into(iris_merge, "iris", irisdb) %>% + dplyr::collect() + expect_equal(out_merge, out_merge2) + } + ) + test_that( + "Extra column (not in SQL) was retained in `out_merge`", + { + expect_true("extracol" %in% colnames(out_merge)) + } + ) + })