From b8a97804df30b77e8dc89a898d6e210ddf8cd31d Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 08:58:53 -0500
Subject: [PATCH 01/18] TRY: Add `try2sqlite` function. Move old workflow.

---
 base/db/.Rbuildignore                         |  1 +
 base/db/.gitignore                            |  1 +
 base/db/DESCRIPTION                           |  4 +-
 base/db/NAMESPACE                             |  1 +
 base/db/R/try2sqlite.R                        | 80 +++++++++++++++++++
 base/db/inst/import-try/01_try_sqlite.R       | 27 +++++++
 ...{01.global.subset.R => 91.global.subset.R} |  0
 ...fic.subset.R => 92.data.specific.subset.R} |  0
 ...eate.try.sites.R => 93.create.try.sites.R} |  0
 ...{04.match.species.R => 94.match.species.R} |  0
 .../{05.citations.R => 95.citations.R}        |  0
 .../{06.load.data.R => 96.load.data.R}        |  0
 base/db/man/try2sqlite.Rd                     | 26 ++++++
 13 files changed, 139 insertions(+), 1 deletion(-)
 create mode 100644 base/db/.gitignore
 create mode 100644 base/db/R/try2sqlite.R
 create mode 100644 base/db/inst/import-try/01_try_sqlite.R
 rename base/db/inst/import-try/{01.global.subset.R => 91.global.subset.R} (100%)
 rename base/db/inst/import-try/{02.data.specific.subset.R => 92.data.specific.subset.R} (100%)
 rename base/db/inst/import-try/{03.create.try.sites.R => 93.create.try.sites.R} (100%)
 rename base/db/inst/import-try/{04.match.species.R => 94.match.species.R} (100%)
 rename base/db/inst/import-try/{05.citations.R => 95.citations.R} (100%)
 rename base/db/inst/import-try/{06.load.data.R => 96.load.data.R} (100%)
 create mode 100644 base/db/man/try2sqlite.Rd

diff --git a/base/db/.Rbuildignore b/base/db/.Rbuildignore
index 91114bf2f2b..149d2856797 100644
--- a/base/db/.Rbuildignore
+++ b/base/db/.Rbuildignore
@@ -1,2 +1,3 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
+try.sqlite
diff --git a/base/db/.gitignore b/base/db/.gitignore
new file mode 100644
index 00000000000..bcf2011eddd
--- /dev/null
+++ b/base/db/.gitignore
@@ -0,0 +1 @@
+try.sqlite
diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION
index 0e1d9183067..62cb1a48523 100644
--- a/base/db/DESCRIPTION
+++ b/base/db/DESCRIPTION
@@ -25,7 +25,9 @@ Imports:
     udunits2
 Suggests:
     RPostgreSQL,
-    testthat (>= 1.0.2)
+    RSQLite,
+    testthat (>= 1.0.2),
+    data.table
 License: FreeBSD + file LICENSE
 Copyright: Authors
 LazyLoad: yes
diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE
index 78717a7f0b4..d9d61ef7ff2 100644
--- a/base/db/NAMESPACE
+++ b/base/db/NAMESPACE
@@ -44,6 +44,7 @@ export(query.traits)
 export(rename_jags_columns)
 export(runs)
 export(take.samples)
+export(try2sqlite)
 export(var_names_all)
 export(workflow)
 export(workflows)
diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R
new file mode 100644
index 00000000000..633bb499a48
--- /dev/null
+++ b/base/db/R/try2sqlite.R
@@ -0,0 +1,80 @@
+#' Convert TRY text file to SQLite database
+#'
+#' The TRY file is huge and unnecessarily long, which makes it difficult to 
+#' work with. The resulting SQLite database is much smaller on disk, and can be 
+#' read much faster thanks to lazy evaluation.
+#'
+#' The resulting TRY SQLite database contains the following tables:
+#'  - `values` -- The actual TRY data. Links to all other tables through ID columns.
+#'  - `traits` -- 
+#'  - `datasets`
+#'  - `species`
+#'
+#' @param try_files Character vector of file names containing TRY data.  
+#' Multiple files will be `rbind`-ed together.
+#' @param sqlite_file Target SQLite database file name, as character.
+#' @export
+try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
+  # Read files
+  raw_data <- Map(data.table::fread, try_files) %>%
+    data.table::rbindlist()
+
+  # Create integer reference ID for compact storage
+  raw_data[, ReferenceID := as.integer(factor(Reference))]
+
+  # Create tables
+  data_cols <- c(
+    "ObsDataID",        # TRY row ID -- unique to each observation of a given trait
+    "ObservationID",    # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf)
+    "DataID",           # Links to data ID
+    "StdValue",         # Standardized, QA-QC'ed value
+    "UnitName",         # Standardized unit
+    "AccSpeciesID",     # Link to 'species' table
+    "DatasetID",        # Link to 'datasets' table.
+    "ReferenceID",      # Link to 'try_references' table.
+    "ValueKindName",    # Type of value, e.g. mean, min, max, etc.
+    "UncertaintyName",  # Kind of uncertainty
+    "Replicates",       # Number of replicates
+    "RelUncertaintyPercent",
+    "OrigValueStr",         # Original data, as character string (before QA/QC)
+    "OrigUnitStr",          # Original unit, as character string (before QA/QC)
+    "OrigUncertaintyStr"    # Original uncertainty, as character string (before QA/QC)
+  )
+  data_values <- unique(raw_data[, data_cols, with = FALSE])
+  data_values[, ]
+
+  datasets_cols <- c(
+    "DatasetID",
+    "Dataset",
+    "LastName",
+    "FirstName",
+    "Reference"
+  )
+  datasets_values <- unique(raw_data[, datasets_cols, with = FALSE])
+  datasets_values[, doi := character()]   # Add DOI column, to be filled later
+
+  traits_cols <- c(
+    "DataID",
+    "DataName",
+    "TraitID",
+    "TraitName"
+  )
+  traits_values <- unique(raw_data[, traits_cols, with = FALSE])
+
+  species_cols <- c(
+    "AccSpeciesID",
+    "AccSpeciesName",
+    "SpeciesName"
+  )
+  species_values <- unique(raw_data[, species_cols, with = FALSE])
+  species_values[, BetySpeciesID := integer()]   # Add BETY ID column, to populate later
+
+  con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
+  on.exit(DBI::dbDisconnect(con))
+  DBI::dbWriteTable(con, "values", data_values)
+  DBI::dbWriteTable(con, "traits", data_values)
+  DBI::dbWriteTable(con, "datasets", datasets_values)
+  DBI::dbWriteTable(con, "species", species_values)
+
+  NULL
+}
diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R
new file mode 100644
index 00000000000..4010e1112a9
--- /dev/null
+++ b/base/db/inst/import-try/01_try_sqlite.R
@@ -0,0 +1,27 @@
+# ---
+# title: Example TRY import workflow
+# author: Alexey Shiklomanov
+# ---
+
+# Create the TRY SQLite database
+try_files <- "~/Projects/try/try-data/1829.txt"
+sqlite_file <- "try.sqlite"
+
+if (!file.exists(sqlite_file)) {
+  try2sqlite(try_files, sqlite_file)
+}
+
+trydb <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
+bety <- DBI::dbConnect(
+  RPostgres::Postgres(),
+  dbname = "bety",
+  host = "test-pecan.bu.edu",
+  user = "bety",
+  password = "bety"
+)
+
+# Add citations
+
+# Add sites
+
+# Add data
diff --git a/base/db/inst/import-try/01.global.subset.R b/base/db/inst/import-try/91.global.subset.R
similarity index 100%
rename from base/db/inst/import-try/01.global.subset.R
rename to base/db/inst/import-try/91.global.subset.R
diff --git a/base/db/inst/import-try/02.data.specific.subset.R b/base/db/inst/import-try/92.data.specific.subset.R
similarity index 100%
rename from base/db/inst/import-try/02.data.specific.subset.R
rename to base/db/inst/import-try/92.data.specific.subset.R
diff --git a/base/db/inst/import-try/03.create.try.sites.R b/base/db/inst/import-try/93.create.try.sites.R
similarity index 100%
rename from base/db/inst/import-try/03.create.try.sites.R
rename to base/db/inst/import-try/93.create.try.sites.R
diff --git a/base/db/inst/import-try/04.match.species.R b/base/db/inst/import-try/94.match.species.R
similarity index 100%
rename from base/db/inst/import-try/04.match.species.R
rename to base/db/inst/import-try/94.match.species.R
diff --git a/base/db/inst/import-try/05.citations.R b/base/db/inst/import-try/95.citations.R
similarity index 100%
rename from base/db/inst/import-try/05.citations.R
rename to base/db/inst/import-try/95.citations.R
diff --git a/base/db/inst/import-try/06.load.data.R b/base/db/inst/import-try/96.load.data.R
similarity index 100%
rename from base/db/inst/import-try/06.load.data.R
rename to base/db/inst/import-try/96.load.data.R
diff --git a/base/db/man/try2sqlite.Rd b/base/db/man/try2sqlite.Rd
new file mode 100644
index 00000000000..65f963e8bf3
--- /dev/null
+++ b/base/db/man/try2sqlite.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/try2sqlite.R
+\name{try2sqlite}
+\alias{try2sqlite}
+\title{Convert TRY text file to SQLite database}
+\usage{
+try2sqlite(try_files, sqlite_file = "try.sqlite")
+}
+\arguments{
+\item{try_files}{Character vector of file names containing TRY data.
+Multiple files will be `rbind`-ed together.}
+
+\item{sqlite_file}{Target SQLite database file name, as character.}
+}
+\description{
+The TRY file is huge and unnecessarily long, which makes it difficult to 
+work with. The resulting SQLite database is much smaller on disk, and can be 
+read much faster thanks to lazy evaluation.
+}
+\details{
+The resulting TRY SQLite database contains the following tables:
+ - `values` -- The actual TRY data. Links to all other tables through ID columns.
+ - `traits` -- 
+ - `datasets`
+ - `species`
+}

From fe89c99bb0784489617b56e75e3b560a1b95dec0 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 11:33:42 -0600
Subject: [PATCH 02/18] DB: Fix `data.table` issues in try2sqlite

---
 base/db/.Rbuildignore                   |  2 +-
 base/db/R/try2sqlite.R                  | 19 ++++++++-------
 base/db/inst/import-try/01_try_sqlite.R | 31 +++++++++----------------
 base/db/man/try2sqlite.Rd               |  8 +++----
 4 files changed, 27 insertions(+), 33 deletions(-)

diff --git a/base/db/.Rbuildignore b/base/db/.Rbuildignore
index 149d2856797..02548d35d12 100644
--- a/base/db/.Rbuildignore
+++ b/base/db/.Rbuildignore
@@ -1,3 +1,3 @@
 ^.*\.Rproj$
 ^\.Rproj\.user$
-try.sqlite
+try\.sqlite
diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R
index 633bb499a48..ffbcb9fe0bc 100644
--- a/base/db/R/try2sqlite.R
+++ b/base/db/R/try2sqlite.R
@@ -6,12 +6,12 @@
 #'
 #' The resulting TRY SQLite database contains the following tables:
 #'  - `values` -- The actual TRY data. Links to all other tables through ID columns.
-#'  - `traits` -- 
-#'  - `datasets`
-#'  - `species`
+#'  - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table.
+#'  - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`.
+#'  - `species` -- Species. Links to `values` through `AccSpeciesID`.
 #'
 #' @param try_files Character vector of file names containing TRY data.  
-#' Multiple files will be `rbind`-ed together.
+#' Multiple files are combined with `data.table::rbindlist`.
 #' @param sqlite_file Target SQLite database file name, as character.
 #' @export
 try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
@@ -20,7 +20,11 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
     data.table::rbindlist()
 
   # Create integer reference ID for compact storage
-  raw_data[, ReferenceID := as.integer(factor(Reference))]
+  data.table::set(
+    raw_data,
+    j = "ReferenceID",
+    value = as.integer(factor(raw_data[["Reference"]]))
+  )
 
   # Create tables
   data_cols <- c(
@@ -41,7 +45,6 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
     "OrigUncertaintyStr"    # Original uncertainty, as character string (before QA/QC)
   )
   data_values <- unique(raw_data[, data_cols, with = FALSE])
-  data_values[, ]
 
   datasets_cols <- c(
     "DatasetID",
@@ -51,7 +54,8 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
     "Reference"
   )
   datasets_values <- unique(raw_data[, datasets_cols, with = FALSE])
-  datasets_values[, doi := character()]   # Add DOI column, to be filled later
+  # Add DOI column, to be filled later
+  datasets_values[["doi"]] <- NA_character_
 
   traits_cols <- c(
     "DataID",
@@ -67,7 +71,6 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
     "SpeciesName"
   )
   species_values <- unique(raw_data[, species_cols, with = FALSE])
-  species_values[, BetySpeciesID := integer()]   # Add BETY ID column, to populate later
 
   con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
   on.exit(DBI::dbDisconnect(con))
diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R
index 4010e1112a9..6dcd932b343 100644
--- a/base/db/inst/import-try/01_try_sqlite.R
+++ b/base/db/inst/import-try/01_try_sqlite.R
@@ -1,27 +1,18 @@
-# ---
-# title: Example TRY import workflow
-# author: Alexey Shiklomanov
-# ---
+# Create the TRY SQLite database
+library(PEcAn.DB)
 
 # Create the TRY SQLite database
-try_files <- "~/Projects/try/try-data/1829.txt"
+try_files <- "~/try-data/1829.txt"
 sqlite_file <- "try.sqlite"
 
-if (!file.exists(sqlite_file)) {
-  try2sqlite(try_files, sqlite_file)
+if (!exists("overwrite")) {
+  overwrite <- FALSE
 }
 
-trydb <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
-bety <- DBI::dbConnect(
-  RPostgres::Postgres(),
-  dbname = "bety",
-  host = "test-pecan.bu.edu",
-  user = "bety",
-  password = "bety"
-)
-
-# Add citations
-
-# Add sites
+if (!file.exists(sqlite_file) || overwrite) {
+  try2sqlite(try_files, sqlite_file)
+}
 
-# Add data
+if (!exists("force_add_doi")) {
+  force_add_doi <- FALSE
+}
diff --git a/base/db/man/try2sqlite.Rd b/base/db/man/try2sqlite.Rd
index 65f963e8bf3..e4b6407712b 100644
--- a/base/db/man/try2sqlite.Rd
+++ b/base/db/man/try2sqlite.Rd
@@ -8,7 +8,7 @@ try2sqlite(try_files, sqlite_file = "try.sqlite")
 }
 \arguments{
 \item{try_files}{Character vector of file names containing TRY data.
-Multiple files will be `rbind`-ed together.}
+Multiple files are combined with `data.table::rbindlist`.}
 
 \item{sqlite_file}{Target SQLite database file name, as character.}
 }
@@ -20,7 +20,7 @@ read much faster thanks to lazy evaluation.
 \details{
 The resulting TRY SQLite database contains the following tables:
  - `values` -- The actual TRY data. Links to all other tables through ID columns.
- - `traits` -- 
- - `datasets`
- - `species`
+ - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table.
+ - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`.
+ - `species` -- Species. Links to `values` through `AccSpeciesID`.
 }

From e1128e3626c2c29169200eccba304c1e82d2a895 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 11:34:09 -0600
Subject: [PATCH 03/18] DB: Add `search_references` functions

---
 base/db/DESCRIPTION                    |  4 +-
 base/db/NAMESPACE                      |  1 +
 base/db/R/search_references.R          | 66 ++++++++++++++++++++++++++
 base/db/inst/import-try/02_citations.R | 19 ++++++++
 base/db/inst/import-try/README.md      |  7 ++-
 base/db/inst/import-try/bety_connect.R |  6 +++
 base/db/man/search_reference_single.Rd | 21 ++++++++
 base/db/man/search_references.Rd       | 24 ++++++++++
 8 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 base/db/R/search_references.R
 create mode 100644 base/db/inst/import-try/02_citations.R
 create mode 100644 base/db/inst/import-try/bety_connect.R
 create mode 100644 base/db/man/search_reference_single.Rd
 create mode 100644 base/db/man/search_references.Rd

diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION
index 62cb1a48523..8dabca46856 100644
--- a/base/db/DESCRIPTION
+++ b/base/db/DESCRIPTION
@@ -27,7 +27,9 @@ Suggests:
     RPostgreSQL,
     RSQLite,
     testthat (>= 1.0.2),
-    data.table
+    tidyverse,
+    data.table,
+    rcrossref
 License: FreeBSD + file LICENSE
 Copyright: Authors
 LazyLoad: yes
diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE
index d9d61ef7ff2..b06517ac3c5 100644
--- a/base/db/NAMESPACE
+++ b/base/db/NAMESPACE
@@ -43,6 +43,7 @@ export(query.trait.data)
 export(query.traits)
 export(rename_jags_columns)
 export(runs)
+export(search_references)
 export(take.samples)
 export(try2sqlite)
 export(var_names_all)
diff --git a/base/db/R/search_references.R b/base/db/R/search_references.R
new file mode 100644
index 00000000000..717fb47c5ba
--- /dev/null
+++ b/base/db/R/search_references.R
@@ -0,0 +1,66 @@
+#' Perform crossref search for a list of references
+#'
+#' @param queries Character vector of queries
+#' @inheritDotParams search_reference_single
+#' @inherit search_reference_single description return
+#' @export
+search_references <- function(queries, ...) {
+  search_fun <- search_reference_single %>%
+    purrr::partial(...) %>%
+    purrr::possibly(otherwise = data.frame(title = "Not found"))
+  encodeString(queries) %>%
+    purrr::map_dfr(search_fun)
+}
+
+#' Perform crossref search for a single reference
+#'
+#' Requires the `rcrossref` package.
+#'
+#' @param query Citation string (length 1) to search for DOI
+#' @param min_score Minimum match score. Default (85) is fairly strict.
+#' @param limit Number of results to return
+#' @return `data.frame` containing crossref information converted to match bety citations table.
+search_reference_single <- function(query, limit = 1, min_score = 85) {
+  stopifnot(length(query) == 1)
+  PEcAn.logger::logger.debug("Processing query:\n", query)
+  crsearch <- rcrossref::cr_works(query = query, limit = limit)
+  if (is.null(crsearch[["data"]])) {
+    PEcAn.logger::logger.warn(
+      "Error in crossref query. ",
+      "Setting title to search string and leaving other fields blank."
+    )
+    return(tibble::tibble(query = query))
+  }
+  crdata <- crsearch[["data"]] %>%
+    dplyr::mutate(score = as.numeric(score)) %>%
+    dplyr::filter(score > min_score)
+  if (nrow(crdata) < 1) {
+    PEcAn.logger::logger.info(
+      "No matches found. ",
+      "Setting title to search string and leaving other fields blank.")
+    return(tibble::tibble(query = query))
+  }
+  keep_cols <- c(
+    "author",
+    "year",
+    "title",
+    journal = "container.title",
+    vol = "volume",
+    pg = "page",
+    doi = "DOI",
+    "score",
+    "query"
+  )
+  proc_search <- crsearch %>%
+    dplyr::mutate(
+      author = purrr::map_chr(
+        author,
+        ~paste(.[["given"]], .[["family"]], sep = " ", collapse = ", ")),
+      year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(),
+      query = query,
+      score = as.numeric(score)
+    )
+  use_cols <- keep_cols[keep_cols %in% colnames(proc_search)]
+  select(proc_search, !!!use_cols)
+}
+
diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R
new file mode 100644
index 00000000000..5039c078c0b
--- /dev/null
+++ b/base/db/inst/import-try/02_citations.R
@@ -0,0 +1,19 @@
+# Add DOIs to TRY citations
+library(tidyverse)
+library(DBI)
+library(RSQLite)
+library(rcrossref)
+library(PEcAn.logger)
+
+sqlite_file <- "try.sqlite"
+
+trydb <- dbConnect(SQLite(), sqlite_file)
+references <- trydb %>%
+  tbl("datasets") %>%
+  distinct(Reference) %>%
+  pull(Reference)
+
+references <- readLines("references")[1:15]
+
+logger.setLevel("DEBUG")
+tidy_refs <- search_references(references)
diff --git a/base/db/inst/import-try/README.md b/base/db/inst/import-try/README.md
index 33ec4ade800..7ad5b00de76 100644
--- a/base/db/inst/import-try/README.md
+++ b/base/db/inst/import-try/README.md
@@ -1,8 +1,13 @@
 ---
-title: "Import TRY database into BETY""
+title: "Import TRY database into BETY"
 Author: "Alexey Shiklomanov"
 ---
 
+# Workflow
+
+1. `01_try_sqlite.R` -- Read TRY data files (huge tab-delimited text files) and convert them to an SQLite database, which is much faster and easier to work with.
+2. `02_citations.R` -- Add DOIs to TRY references, if possible.
+
 # Package dependencies
 1. `data.table` -- Makes it remotely possible to work with the TRY database. Requires an up-to-date version, so if parts of the workflow break, try re-installing this.
 2. `bit64` -- Used by `data.table` to read and store large integers, which constitude most of the ID's in TRY and BETY.
diff --git a/base/db/inst/import-try/bety_connect.R b/base/db/inst/import-try/bety_connect.R
new file mode 100644
index 00000000000..89d20b37f12
--- /dev/null
+++ b/base/db/inst/import-try/bety_connect.R
@@ -0,0 +1,6 @@
+bety <- PEcAn.DB::db.open(list(
+  host = "localhost",
+  dbname = "bety",
+  user = "bety",
+  password = "bety"
+))
diff --git a/base/db/man/search_reference_single.Rd b/base/db/man/search_reference_single.Rd
new file mode 100644
index 00000000000..c689ccdb86d
--- /dev/null
+++ b/base/db/man/search_reference_single.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/search_references.R
+\name{search_reference_single}
+\alias{search_reference_single}
+\title{Perform crossref search for a single reference}
+\usage{
+search_reference_single(query, limit = 1, min_score = 85)
+}
+\arguments{
+\item{query}{Citation string (length 1) to search for DOI}
+
+\item{limit}{Number of results to return}
+
+\item{min_score}{Minimum match score. Default (85) is fairly strict.}
+}
+\value{
+`data.frame` containing crossref information converted to match bety citations table.
+}
+\description{
+Requires the `rcrossref` package.
+}
diff --git a/base/db/man/search_references.Rd b/base/db/man/search_references.Rd
new file mode 100644
index 00000000000..8af809bf7a1
--- /dev/null
+++ b/base/db/man/search_references.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/search_references.R
+\name{search_references}
+\alias{search_references}
+\title{Perform crossref search for a list of references}
+\usage{
+search_references(queries, ...)
+}
+\arguments{
+\item{queries}{Character vector of queries}
+
+\item{...}{Arguments passed on to \code{search_reference_single}
+\describe{
+  \item{query}{Citation string (length 1) to search for DOI}
+  \item{min_score}{Minimum match score. Default (85) is fairly strict.}
+  \item{limit}{Number of results to return}
+}}
+}
+\value{
+`data.frame` containing crossref information converted to match bety citations table.
+}
+\description{
+Perform crossref search for a list of references
+}

From dbad6dc0e9d5693da78bf821bef111caa6c0faa9 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 11:47:09 -0600
Subject: [PATCH 04/18] DB: Bugfix search_references

---
 base/db/R/search_references.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/db/R/search_references.R b/base/db/R/search_references.R
index 717fb47c5ba..86f5e4150f2 100644
--- a/base/db/R/search_references.R
+++ b/base/db/R/search_references.R
@@ -51,7 +51,7 @@ search_reference_single <- function(query, limit = 1, min_score = 85) {
     "score",
     "query"
   )
-  proc_search <- crsearch %>%
+  proc_search <- crdata %>%
     dplyr::mutate(
       author = purrr::map_chr(
         author,

From 004f0fef643df5a797aa3b12e3eaeca567e053bd Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 12:37:39 -0600
Subject: [PATCH 05/18] DB: Add `insert_table` function

---
 base/db/NAMESPACE                 |  1 +
 base/db/R/insert_table.R          | 67 +++++++++++++++++++++++++++++++
 base/db/man/build_insert_query.Rd | 18 +++++++++
 base/db/man/insert_table.Rd       | 30 ++++++++++++++
 base/db/man/match_dbcols.Rd       | 18 +++++++++
 5 files changed, 134 insertions(+)
 create mode 100644 base/db/R/insert_table.R
 create mode 100644 base/db/man/build_insert_query.Rd
 create mode 100644 base/db/man/insert_table.Rd
 create mode 100644 base/db/man/match_dbcols.Rd

diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE
index b06517ac3c5..2724becb5e9 100644
--- a/base/db/NAMESPACE
+++ b/base/db/NAMESPACE
@@ -32,6 +32,7 @@ export(get_run_ids)
 export(get_users)
 export(get_var_names)
 export(get_workflow_ids)
+export(insert_table)
 export(load_data_single_run)
 export(ncdays2date)
 export(query.file.path)
diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R
new file mode 100644
index 00000000000..0a8b23fdc5b
--- /dev/null
+++ b/base/db/R/insert_table.R
@@ -0,0 +1,67 @@
+#' Insert R data frame into SQL database
+#'
+#' First, subset to matching columns. Then, build an SQL string for the insert 
+#' statement. Finally, insert into the database.
+#'
+#' @param values `data.frame` of values to write to SQL database
+#' @param table Name of target SQL table, as character
+#' @inheritParams db.query
+#' @inherit db.query return
+#' @export
+#' @examples
+#' library(dplyr)
+#' irisfile <- tempfile(fileext = ".sqlite")
+#' irisdb <- src_sqlite(irisfile, create = TRUE)
+#' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
+#' insert_table(iris[-1,], "iris", irisdb)
+#' tbl(irisdb, "iris")
+insert_table <- function(values, table, con) {
+  use_cols <- match_dbcols(values, table, con)
+  if (length(use_cols) < 1) {
+    PEcAn.logger::logger.severe(
+      "No columns match between input and target table."
+    )
+  }
+  PEcAn.logger::logger.debug(
+    "Matched the following cols: ",
+    paste(use_cols, collapse = ", ")
+  )
+  values_sub <- values[, use_cols]
+  insert_query <- build_insert_query(values, table, con = con)
+  db.query(con, insert_query)
+}
+
+#' Match names of local data frame to SQL table
+#'
+#' @inheritParams insert_table
+match_dbcols <- function(values, table, con) {
+  tbl_db <- dplyr::tbl(con, table)
+  table_cols <- dplyr::tbl_vars(tbl_db)
+  values_cols <- colnames(values)
+  intersect(values_cols, table_cols)
+}
+
+#' Build query to insert R data frame into SQL table
+#'
+#' @inheritParams insert_table
+#' @param ... Additional arguments to [dbplyr::build_sql]
+build_insert_query <- function(values, table, ...) {
+  value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ]))
+
+  insert_list <- value_list %>%
+    purrr::map(unname) %>%
+    purrr::map(dbplyr::escape) %>%
+    purrr::map(dbplyr::sql_vector)
+
+  dbplyr::build_sql(
+    dbplyr::sql("INSERT INTO"),
+    dbplyr::sql(" "),
+    dbplyr::ident(table),
+    dbplyr::sql(" "),
+    dbplyr::sql_vector(dbplyr::escape(colnames(values))),
+    dbplyr::sql(" "),
+    dbplyr::sql("VALUES"),
+    dbplyr::sql(" "),
+    dbplyr::sql_vector(insert_list, parens = FALSE, collapse = ", "), ...
+  )
+}
diff --git a/base/db/man/build_insert_query.Rd b/base/db/man/build_insert_query.Rd
new file mode 100644
index 00000000000..a11bdc46c84
--- /dev/null
+++ b/base/db/man/build_insert_query.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/insert_table.R
+\name{build_insert_query}
+\alias{build_insert_query}
+\title{Build query to insert R data frame into SQL table}
+\usage{
+build_insert_query(values, table, ...)
+}
+\arguments{
+\item{values}{`data.frame` of values to write to SQL database}
+
+\item{table}{Name of target SQL table, as character}
+
+\item{...}{Additional arguments to [dbplyr::build_sql]}
+}
+\description{
+Build query to insert R data frame into SQL table
+}
diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd
new file mode 100644
index 00000000000..f63acf7cc70
--- /dev/null
+++ b/base/db/man/insert_table.Rd
@@ -0,0 +1,30 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/insert_table.R
+\name{insert_table}
+\alias{insert_table}
+\title{Insert R data frame into SQL database}
+\usage{
+insert_table(values, table, con)
+}
+\arguments{
+\item{values}{`data.frame` of values to write to SQL database}
+
+\item{table}{Name of target SQL table, as character}
+
+\item{con}{database connection object}
+}
+\value{
+data frame with query results
+}
+\description{
+First, subset to matching columns. Then, build an SQL string for the insert 
+statement. Finally, insert into the database.
+}
+\examples{
+library(dplyr)
+irisfile <- tempfile(fileext = ".sqlite")
+irisdb <- src_sqlite(irisfile, create = TRUE)
+copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
+insert_table(iris[-1,], "iris", irisdb)
+tbl(irisdb, "iris")
+}
diff --git a/base/db/man/match_dbcols.Rd b/base/db/man/match_dbcols.Rd
new file mode 100644
index 00000000000..d70b74717ae
--- /dev/null
+++ b/base/db/man/match_dbcols.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/insert_table.R
+\name{match_dbcols}
+\alias{match_dbcols}
+\title{Match names of local data frame to SQL table}
+\usage{
+match_dbcols(values, table, con)
+}
+\arguments{
+\item{values}{`data.frame` of values to write to SQL database}
+
+\item{table}{Name of target SQL table, as character}
+
+\item{con}{database connection object}
+}
+\description{
+Match names of local data frame to SQL table
+}

From 1a0c15448a6e947812f517a3279647c64bbf79b9 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 13:01:07 -0600
Subject: [PATCH 06/18] DB: Add `db_merge_into` function.

Also, bugfix `insert_table`.
---
 base/db/R/db_merge_into.R    | 26 ++++++++++++++++++++++++++
 base/db/R/insert_table.R     |  4 ++--
 base/db/man/db_merge_into.Rd | 29 +++++++++++++++++++++++++++++
 base/db/man/insert_table.Rd  |  2 +-
 4 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 base/db/R/db_merge_into.R
 create mode 100644 base/db/man/db_merge_into.Rd

diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R
new file mode 100644
index 00000000000..64bb859b956
--- /dev/null
+++ b/base/db/R/db_merge_into.R
@@ -0,0 +1,26 @@
+#' Merge local data frame into SQL table
+#'
+#' @inheritParams insert_table
+#' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values`
+#' @examples
+#' library(dplyr)
+#' library(RSQLite)
+#' irisfile <- tempfile(fileext = ".sqlite")
+#' irisdb <- dbConnect(SQLite(), irisfile)
+#' copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
+#' db_merge_into(iris[1:12,], "iris", irisdb)
+#' tbl(irisdb, "iris") %>% count()
+db_merge_into <- function(values, table, con, by = NULL) {
+  if (is.null(by)) {
+    by <- match_dbcols(values, table, con)
+  }
+  sql_tbl <- dplyr::tbl(con, table)
+  values_merge <- dplyr::anti_join(values, sql_tbl, by = by, copy = TRUE)
+  if (nrow(values_merge) < 1 || ncol(values_merge) < 1) {
+    PEcAn.logger::logger.warn(
+      "Input table for merge is empty. Skipping operation and returning NULL."
+    )
+    return(NULL)
+  }
+  insert_table(values_merge, table, con)
+}
diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R
index 0a8b23fdc5b..0d833526d23 100644
--- a/base/db/R/insert_table.R
+++ b/base/db/R/insert_table.R
@@ -13,7 +13,7 @@
 #' irisfile <- tempfile(fileext = ".sqlite")
 #' irisdb <- src_sqlite(irisfile, create = TRUE)
 #' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
-#' insert_table(iris[-1,], "iris", irisdb)
+#' insert_table(iris[-1,], "iris", irisdb$con)
 #' tbl(irisdb, "iris")
 insert_table <- function(values, table, con) {
   use_cols <- match_dbcols(values, table, con)
@@ -28,7 +28,7 @@ insert_table <- function(values, table, con) {
   )
   values_sub <- values[, use_cols]
   insert_query <- build_insert_query(values, table, con = con)
-  db.query(con, insert_query)
+  db.query(insert_query, con)
 }
 
 #' Match names of local data frame to SQL table
diff --git a/base/db/man/db_merge_into.Rd b/base/db/man/db_merge_into.Rd
new file mode 100644
index 00000000000..ebd4c201ec6
--- /dev/null
+++ b/base/db/man/db_merge_into.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/db_merge_into.R
+\name{db_merge_into}
+\alias{db_merge_into}
+\title{Merge local data frame into SQL table}
+\usage{
+db_merge_into(values, table, con, by = NULL)
+}
+\arguments{
+\item{values}{`data.frame` of values to write to SQL database}
+
+\item{table}{Name of target SQL table, as character}
+
+\item{con}{database connection object}
+
+\item{by}{Character vector of columns by which to perform merge. Defaults to all columns in `values`}
+}
+\description{
+Merge local data frame into SQL table
+}
+\examples{
+library(dplyr)
+library(RSQLite)
+irisfile <- tempfile(fileext = ".sqlite")
+irisdb <- dbConnect(SQLite(), irisfile)
+copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
+db_merge_into(iris[1:12,], "iris", irisdb)
+tbl(irisdb, "iris") \%>\% count()
+}
diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd
index f63acf7cc70..01e392b842a 100644
--- a/base/db/man/insert_table.Rd
+++ b/base/db/man/insert_table.Rd
@@ -25,6 +25,6 @@ library(dplyr)
 irisfile <- tempfile(fileext = ".sqlite")
 irisdb <- src_sqlite(irisfile, create = TRUE)
 copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
-insert_table(iris[-1,], "iris", irisdb)
+insert_table(iris[-1,], "iris", irisdb$con)
 tbl(irisdb, "iris")
 }

From cafbdda47addadc885d309041d7609afcf55152e Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 14:29:02 -0600
Subject: [PATCH 07/18] DB: Minor additions to citations script

---
 base/db/inst/import-try/02_citations.R | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R
index 5039c078c0b..b831c40ddda 100644
--- a/base/db/inst/import-try/02_citations.R
+++ b/base/db/inst/import-try/02_citations.R
@@ -4,6 +4,9 @@ library(DBI)
 library(RSQLite)
 library(rcrossref)
 library(PEcAn.logger)
+library(PEcAn.DB)
+
+logger.setLevel("DEBUG")
 
 sqlite_file <- "try.sqlite"
 
@@ -13,7 +16,8 @@ references <- trydb %>%
   distinct(Reference) %>%
   pull(Reference)
 
-references <- readLines("references")[1:15]
+try_refs <- search_references(references, min_score = 1)
 
-logger.setLevel("DEBUG")
-tidy_refs <- search_references(references)
+source("inst/import-try/bety_connect.R")
+
+# TODO: Use object `bety`, table `citations`

From d339e1a269bb54d0e9a4852420b3fc62c2118e89 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 15:07:22 -0600
Subject: [PATCH 08/18] DB: Make db_merge_into output table

This is to catch the automatic addition of IDs and other timestamps, for
instance by Postgres.

Also, add insert function unit tests.
---
 base/db/NAMESPACE                    |  1 +
 base/db/R/db_merge_into.R            |  6 ++++-
 base/db/man/db_merge_into.Rd         |  3 +++
 base/db/tests/testthat/test.insert.R | 40 ++++++++++++++++++++++++++++
 4 files changed, 49 insertions(+), 1 deletion(-)
 create mode 100644 base/db/tests/testthat/test.insert.R

diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE
index 2724becb5e9..a10100f4152 100644
--- a/base/db/NAMESPACE
+++ b/base/db/NAMESPACE
@@ -12,6 +12,7 @@ export(db.print.connections)
 export(db.query)
 export(db.showQueries)
 export(dbHostInfo)
+export(db_merge_into)
 export(dbfile.check)
 export(dbfile.file)
 export(dbfile.id)
diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R
index 64bb859b956..8f9d7dec52f 100644
--- a/base/db/R/db_merge_into.R
+++ b/base/db/R/db_merge_into.R
@@ -2,6 +2,8 @@
 #'
 #' @inheritParams insert_table
 #' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values`
+#' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table)
+#' @export
 #' @examples
 #' library(dplyr)
 #' library(RSQLite)
@@ -22,5 +24,7 @@ db_merge_into <- function(values, table, con, by = NULL) {
     )
     return(NULL)
   }
-  insert_table(values_merge, table, con)
+  insert <- insert_table(values_merge, table, con)
+  dplyr::tbl(con, table) %>%
+    dplyr::inner_join(values, copy = TRUE)
 }
diff --git a/base/db/man/db_merge_into.Rd b/base/db/man/db_merge_into.Rd
index ebd4c201ec6..764dc5a92ad 100644
--- a/base/db/man/db_merge_into.Rd
+++ b/base/db/man/db_merge_into.Rd
@@ -15,6 +15,9 @@ db_merge_into(values, table, con, by = NULL)
 
 \item{by}{Character vector of columns by which to perform merge. Defaults to all columns in `values`}
 }
+\value{
+Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table)
+}
 \description{
 Merge local data frame into SQL table
 }
diff --git a/base/db/tests/testthat/test.insert.R b/base/db/tests/testthat/test.insert.R
new file mode 100644
index 00000000000..76509ccb282
--- /dev/null
+++ b/base/db/tests/testthat/test.insert.R
@@ -0,0 +1,40 @@
+library(PEcAn.DB)
+library(DBI)
+library(testthat)
+library(dplyr)
+context("SQL insertion helper functions")
+
+if (requireNamespace("RSQLite")) {
+  library(RSQLite)
+  iris_file <- tempfile(fileext = ".sqlite")
+  irisdb <- dbConnect(SQLite(), iris_file)
+  iris <- as_tibble(iris) %>%
+    mutate(Species = as.character(Species))
+
+  copy_to(irisdb, iris[1,], "iris", overwrite = TRUE)
+  iris_insert <- iris[2:10,]
+  .insert <- insert_table(iris_insert, "iris", irisdb)
+  iris_insert_test <- tbl(irisdb, "iris") %>% collect()
+  test_that(
+    "Subset of iris was inserted into database",
+    {
+      expect_equal(iris[1:10,], iris_insert_test)
+    }
+  )
+
+  iris_merge <- iris[5:12,]
+  out_merge <- db_merge_into(iris_merge, "iris", irisdb)
+  iris_merge_nrow <- tbl(irisdb, "iris") %>%
+    count() %>%
+    pull(n)
+  test_that(
+    "Only subset of iris data were merged",
+    {
+      expect_equal(collect(out_merge), iris[5:12,])
+    }
+  )
+
+} else {
+  message("Skipping insert tests because `RSQLite` not installed.")
+}
+

From 72af94edaf7435da9db25f6d73f231d9ac9540ef Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 15:18:34 -0600
Subject: [PATCH 09/18] DB: Bugfix `insert_table`

Only use columns that match the SQL database.
---
 base/db/R/db_merge_into.R            |  6 +++---
 base/db/R/insert_table.R             |  2 +-
 base/db/tests/testthat/test.insert.R | 19 ++++++++++++++++---
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R
index 8f9d7dec52f..a387769cf96 100644
--- a/base/db/R/db_merge_into.R
+++ b/base/db/R/db_merge_into.R
@@ -20,11 +20,11 @@ db_merge_into <- function(values, table, con, by = NULL) {
   values_merge <- dplyr::anti_join(values, sql_tbl, by = by, copy = TRUE)
   if (nrow(values_merge) < 1 || ncol(values_merge) < 1) {
     PEcAn.logger::logger.warn(
-      "Input table for merge is empty. Skipping operation and returning NULL."
+      "Input table for merge is empty."
     )
-    return(NULL)
+  } else {
+    insert <- insert_table(values_merge, table, con)
   }
-  insert <- insert_table(values_merge, table, con)
   dplyr::tbl(con, table) %>%
     dplyr::inner_join(values, copy = TRUE)
 }
diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R
index 0d833526d23..ca0f9593304 100644
--- a/base/db/R/insert_table.R
+++ b/base/db/R/insert_table.R
@@ -27,7 +27,7 @@ insert_table <- function(values, table, con) {
     paste(use_cols, collapse = ", ")
   )
   values_sub <- values[, use_cols]
-  insert_query <- build_insert_query(values, table, con = con)
+  insert_query <- build_insert_query(values_sub, table, con = con)
   db.query(insert_query, con)
 }
 
diff --git a/base/db/tests/testthat/test.insert.R b/base/db/tests/testthat/test.insert.R
index 76509ccb282..6df2179c156 100644
--- a/base/db/tests/testthat/test.insert.R
+++ b/base/db/tests/testthat/test.insert.R
@@ -9,10 +9,14 @@ if (requireNamespace("RSQLite")) {
   iris_file <- tempfile(fileext = ".sqlite")
   irisdb <- dbConnect(SQLite(), iris_file)
   iris <- as_tibble(iris) %>%
-    mutate(Species = as.character(Species))
+    mutate(
+      Species = as.character(Species)
+    )
 
   copy_to(irisdb, iris[1,], "iris", overwrite = TRUE)
-  iris_insert <- iris[2:10,]
+  # Add extra column to see if it's successfully ignored
+  iris2 <- mutate(iris, extracol = row_number())
+  iris_insert <- iris2[2:10,]
   .insert <- insert_table(iris_insert, "iris", irisdb)
   iris_insert_test <- tbl(irisdb, "iris") %>% collect()
   test_that(
@@ -22,8 +26,9 @@ if (requireNamespace("RSQLite")) {
     }
   )
 
-  iris_merge <- iris[5:12,]
+  iris_merge <- iris2[5:12,]
   out_merge <- db_merge_into(iris_merge, "iris", irisdb)
+  out_merge2 <- db_merge_into(iris_merge, "iris", irisdb)
   iris_merge_nrow <- tbl(irisdb, "iris") %>%
     count() %>%
     pull(n)
@@ -31,6 +36,14 @@ if (requireNamespace("RSQLite")) {
     "Only subset of iris data were merged",
     {
       expect_equal(collect(out_merge), iris[5:12,])
+      expect_equal(collect(out_merge), collect(out_merge2))
+    }
+  )
+
+  test_that(
+    "Extra column (not in SQL) was retained in `out_merge`",
+    {
+      expect_true("extracol" %in% colnames(out_merge))
     }
   )
 

From fde7c43d69ff25eafa80d26750289b230217dfd3 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 15:23:03 -0600
Subject: [PATCH 10/18] DB: Add referenceID to try.sqlite

---
 base/db/R/try2sqlite.R                  | 11 +++--------
 base/db/inst/import-try/01_try_sqlite.R |  8 ++++----
 2 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R
index ffbcb9fe0bc..595a823b4d1 100644
--- a/base/db/R/try2sqlite.R
+++ b/base/db/R/try2sqlite.R
@@ -20,11 +20,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
     data.table::rbindlist()
 
   # Create integer reference ID for compact storage
-  data.table::set(
-    raw_data,
-    j = "ReferenceID",
-    value = as.integer(factor(raw_data[["Reference"]]))
-  )
+  raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]]))
 
   # Create tables
   data_cols <- c(
@@ -51,11 +47,10 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
     "Dataset",
     "LastName",
     "FirstName",
-    "Reference"
+    "Reference",
+    "ReferenceID"
   )
   datasets_values <- unique(raw_data[, datasets_cols, with = FALSE])
-  # Add DOI column, to be filled later
-  datasets_values[["doi"]] <- NA_character_
 
   traits_cols <- c(
     "DataID",
diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R
index 6dcd932b343..450f1cbbde1 100644
--- a/base/db/inst/import-try/01_try_sqlite.R
+++ b/base/db/inst/import-try/01_try_sqlite.R
@@ -1,5 +1,6 @@
 # Create the TRY SQLite database
 library(PEcAn.DB)
+library(PEcAn.logger)
 
 # Create the TRY SQLite database
 try_files <- "~/try-data/1829.txt"
@@ -10,9 +11,8 @@ if (!exists("overwrite")) {
 }
 
 if (!file.exists(sqlite_file) || overwrite) {
+  file.remove(sqlite_file)
   try2sqlite(try_files, sqlite_file)
-}
-
-if (!exists("force_add_doi")) {
-  force_add_doi <- FALSE
+} else {
+  logger.info("TRY SQLite database already exists and `overwrite` is FALSE. ")
 }

From 998d5b32e13704e43e8fa1d18aef4b0e9d24b577 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 1 Feb 2018 15:50:11 -0600
Subject: [PATCH 11/18] DB: Check column classes in `insert_table`

---
 base/db/R/insert_table.R    | 39 +++++++++++++++++++++++++++++++++----
 base/db/man/insert_table.Rd | 11 ++++++++---
 2 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R
index ca0f9593304..b21b799f904 100644
--- a/base/db/R/insert_table.R
+++ b/base/db/R/insert_table.R
@@ -1,10 +1,14 @@
 #' Insert R data frame into SQL database
 #'
-#' First, subset to matching columns. Then, build an SQL string for the insert 
-#' statement. Finally, insert into the database.
+#' First, subset to matching columns. Then, make sure the local and SQL column 
+#' classes match, coercing local to SQL as necessary (or throwing an error). 
+#' Then, build an SQL string for the insert statement. Finally, insert into the 
+#' database.
 #'
 #' @param values `data.frame` of values to write to SQL database
 #' @param table Name of target SQL table, as character
+#' @param coerce_col_class logical, whether or not to coerce local data columns 
+#' to SQL classes. Default = `TRUE.`
 #' @inheritParams db.query
 #' @inherit db.query return
 #' @export
@@ -15,7 +19,7 @@
 #' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
 #' insert_table(iris[-1,], "iris", irisdb$con)
 #' tbl(irisdb, "iris")
-insert_table <- function(values, table, con) {
+insert_table <- function(values, table, con, coerce_col_class = TRUE) {
   use_cols <- match_dbcols(values, table, con)
   if (length(use_cols) < 1) {
     PEcAn.logger::logger.severe(
@@ -27,7 +31,34 @@ insert_table <- function(values, table, con) {
     paste(use_cols, collapse = ", ")
   )
   values_sub <- values[, use_cols]
-  insert_query <- build_insert_query(values_sub, table, con = con)
+  # Load one row to get column types
+  sql_row <- dplyr::tbl(con, table) %>% head(1) %>% collect()
+  sql_types <- purrr::map(sql_row, class) %>%
+    purrr::map_chr(1) %>%
+    .[use_cols]
+  values_types <- purrr::map(values_sub, class) %>% purrr::map_chr(1)
+  type_mismatch <- sql_types != values_types
+  if (sum(type_mismatch) > 0) {
+    mismatch_string <- sprintf(
+      "%s: local is %s, SQL is %s",
+      names(values_types),
+      values_types,
+      sql_types
+    )[type_mismatch]
+    PEcAn.logger::logger.info(
+      "Found type mismatches in the following columns: ",
+      paste0(mismatch_string, collapse = "; ")
+    )
+    if (!coerce_col_class) {
+      PEcAn.logger::logger.severe(
+        "Type mismatch detected, and `coerce_col_class` is `FALSE`. ",
+        "Fix column class mismatches manually."
+      )
+    }
+  }
+  # Coerce values data frame to these types
+  values_fixed <- purrr::map2_dfc(values_sub, sql_types, as)
+  insert_query <- build_insert_query(values_fixed, table, con = con)
   db.query(insert_query, con)
 }
 
diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd
index 01e392b842a..54c7f3d7320 100644
--- a/base/db/man/insert_table.Rd
+++ b/base/db/man/insert_table.Rd
@@ -4,7 +4,7 @@
 \alias{insert_table}
 \title{Insert R data frame into SQL database}
 \usage{
-insert_table(values, table, con)
+insert_table(values, table, con, coerce_col_class = TRUE)
 }
 \arguments{
 \item{values}{`data.frame` of values to write to SQL database}
@@ -12,13 +12,18 @@ insert_table(values, table, con)
 \item{table}{Name of target SQL table, as character}
 
 \item{con}{database connection object}
+
+\item{coerce_col_class}{logical, whether or not to coerce local data columns 
+to SQL classes. Default = `TRUE.`}
 }
 \value{
 data frame with query results
 }
 \description{
-First, subset to matching columns. Then, build an SQL string for the insert 
-statement. Finally, insert into the database.
+First, subset to matching columns. Then, make sure the local and SQL column 
+classes match, coercing local to SQL as necessary (or throwing an error). 
+Then, build an SQL string for the insert statement. Finally, insert into the 
+database.
 }
 \examples{
 library(dplyr)

From 1e9515ba588eb49770101ccf9f09d0d8e853ca64 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Fri, 2 Feb 2018 08:57:39 -0600
Subject: [PATCH 12/18] DB: Miscellaneous bugfixes to last few commits

---
 base/db/NAMESPACE                      |  1 +
 base/db/R/db_merge_into.R              |  7 +++---
 base/db/R/insert_table.R               | 31 ++++++++++++++++++++------
 base/db/inst/import-try/02_citations.R | 13 ++++++-----
 base/db/man/match_colnames.Rd          | 18 +++++++++++++++
 base/db/man/match_dbcols.Rd            | 12 +++++++---
 6 files changed, 64 insertions(+), 18 deletions(-)
 create mode 100644 base/db/man/match_colnames.Rd

diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE
index a10100f4152..67a63071aa8 100644
--- a/base/db/NAMESPACE
+++ b/base/db/NAMESPACE
@@ -35,6 +35,7 @@ export(get_var_names)
 export(get_workflow_ids)
 export(insert_table)
 export(load_data_single_run)
+export(match_dbcols)
 export(ncdays2date)
 export(query.file.path)
 export(query.format.vars)
diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R
index a387769cf96..eac93e184ab 100644
--- a/base/db/R/db_merge_into.R
+++ b/base/db/R/db_merge_into.R
@@ -13,11 +13,12 @@
 #' db_merge_into(iris[1:12,], "iris", irisdb)
 #' tbl(irisdb, "iris") %>% count()
 db_merge_into <- function(values, table, con, by = NULL) {
+  values_fixed <- match_dbcols(values, table, con)
   if (is.null(by)) {
-    by <- match_dbcols(values, table, con)
+    by <- colnames(values_fixed)
   }
   sql_tbl <- dplyr::tbl(con, table)
-  values_merge <- dplyr::anti_join(values, sql_tbl, by = by, copy = TRUE)
+  values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE)
   if (nrow(values_merge) < 1 || ncol(values_merge) < 1) {
     PEcAn.logger::logger.warn(
       "Input table for merge is empty."
@@ -26,5 +27,5 @@ db_merge_into <- function(values, table, con, by = NULL) {
     insert <- insert_table(values_merge, table, con)
   }
   dplyr::tbl(con, table) %>%
-    dplyr::inner_join(values, copy = TRUE)
+    dplyr::inner_join(values_fixed, copy = TRUE)
 }
diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R
index b21b799f904..8d66449aaec 100644
--- a/base/db/R/insert_table.R
+++ b/base/db/R/insert_table.R
@@ -20,7 +20,19 @@
 #' insert_table(iris[-1,], "iris", irisdb$con)
 #' tbl(irisdb, "iris")
 insert_table <- function(values, table, con, coerce_col_class = TRUE) {
-  use_cols <- match_dbcols(values, table, con)
+  values_fixed <- match_dbcols(values, table, con, coerce_col_class)
+  insert_query <- build_insert_query(values_fixed, table, con = con)
+  print(insert_query)
+  db.query(insert_query, con)
+}
+
+#' Match column names and classes between local and SQL table
+#'
+#' @inheritParams insert_table
+#' @return `values` `data.frame` with column names and classes matched to SQL
+#' @export
+match_dbcols <- function(values, table, con, coerce_col_class = TRUE) {
+  use_cols <- match_colnames(values, table, con)
   if (length(use_cols) < 1) {
     PEcAn.logger::logger.severe(
       "No columns match between input and target table."
@@ -54,18 +66,23 @@ insert_table <- function(values, table, con, coerce_col_class = TRUE) {
         "Type mismatch detected, and `coerce_col_class` is `FALSE`. ",
         "Fix column class mismatches manually."
       )
+    } else {
+      PEcAn.logger::logger.info(
+        "Coercing local column types to match SQL."
+      )
+      # Coerce values data frame to these types
+      values_fixed <- purrr::map2_dfc(values_sub, sql_types, as)
     }
+  } else {
+    values_fixed <- values_sub
   }
-  # Coerce values data frame to these types
-  values_fixed <- purrr::map2_dfc(values_sub, sql_types, as)
-  insert_query <- build_insert_query(values_fixed, table, con = con)
-  db.query(insert_query, con)
+  values_fixed
 }
 
 #' Match names of local data frame to SQL table
 #'
 #' @inheritParams insert_table
-match_dbcols <- function(values, table, con) {
+match_colnames <- function(values, table, con) {
   tbl_db <- dplyr::tbl(con, table)
   table_cols <- dplyr::tbl_vars(tbl_db)
   values_cols <- colnames(values)
@@ -89,7 +106,7 @@ build_insert_query <- function(values, table, ...) {
     dbplyr::sql(" "),
     dbplyr::ident(table),
     dbplyr::sql(" "),
-    dbplyr::sql_vector(dbplyr::escape(colnames(values))),
+    dbplyr::sql_vector(ident(colnames(values)), collapse = ", "),
     dbplyr::sql(" "),
     dbplyr::sql("VALUES"),
     dbplyr::sql(" "),
diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R
index b831c40ddda..beb3e7c6b0b 100644
--- a/base/db/inst/import-try/02_citations.R
+++ b/base/db/inst/import-try/02_citations.R
@@ -11,13 +11,16 @@ logger.setLevel("DEBUG")
 sqlite_file <- "try.sqlite"
 
 trydb <- dbConnect(SQLite(), sqlite_file)
-references <- trydb %>%
-  tbl("datasets") %>%
-  distinct(Reference) %>%
-  pull(Reference)
+reference_dat <- tbl(trydb, "datasets") %>%
+  distinct(Reference, ReferenceID) %>%
+  collect()
 
-try_refs <- search_references(references, min_score = 1)
+refs_proc <- reference_dat %>%
+  mutate(cr_df = map(Reference, search_references, min_score = 40)) %>%
+  unnest()
 
 source("inst/import-try/bety_connect.R")
+bety_refs <- db_merge_into(refs_proc, "citations", bety, "doi") %>%
+  collect()
 
 # TODO: Use object `bety`, table `citations`
diff --git a/base/db/man/match_colnames.Rd b/base/db/man/match_colnames.Rd
new file mode 100644
index 00000000000..38450a89bfa
--- /dev/null
+++ b/base/db/man/match_colnames.Rd
@@ -0,0 +1,18 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/insert_table.R
+\name{match_colnames}
+\alias{match_colnames}
+\title{Match names of local data frame to SQL table}
+\usage{
+match_colnames(values, table, con)
+}
+\arguments{
+\item{values}{`data.frame` of values to write to SQL database}
+
+\item{table}{Name of target SQL table, as character}
+
+\item{con}{database connection object}
+}
+\description{
+Match names of local data frame to SQL table
+}
diff --git a/base/db/man/match_dbcols.Rd b/base/db/man/match_dbcols.Rd
index d70b74717ae..490e5f6b58a 100644
--- a/base/db/man/match_dbcols.Rd
+++ b/base/db/man/match_dbcols.Rd
@@ -2,9 +2,9 @@
 % Please edit documentation in R/insert_table.R
 \name{match_dbcols}
 \alias{match_dbcols}
-\title{Match names of local data frame to SQL table}
+\title{Match column names and classes between local and SQL table}
 \usage{
-match_dbcols(values, table, con)
+match_dbcols(values, table, con, coerce_col_class = TRUE)
 }
 \arguments{
 \item{values}{`data.frame` of values to write to SQL database}
@@ -12,7 +12,13 @@ match_dbcols(values, table, con)
 \item{table}{Name of target SQL table, as character}
 
 \item{con}{database connection object}
+
+\item{coerce_col_class}{logical, whether or not to coerce local data columns 
+to SQL classes. Default = `TRUE.`}
+}
+\value{
+`values` `data.frame` with column names and classes matched to SQL
 }
 \description{
-Match names of local data frame to SQL table
+Match column names and classes between local and SQL table
 }

From 0f3b9c75b8475195129f240c23ed660b150386ac Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Fri, 2 Feb 2018 10:55:37 -0500
Subject: [PATCH 13/18] DB: Add more logging to try2sqlite

When the data are huge, it's nice to get more regular status updates.
---
 base/db/R/try2sqlite.R | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R
index 595a823b4d1..a95f96aa08c 100644
--- a/base/db/R/try2sqlite.R
+++ b/base/db/R/try2sqlite.R
@@ -16,13 +16,16 @@
 #' @export
 try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
   # Read files
+  PEcAn.logger::logger.info("Reading in TRY data...")
   raw_data <- Map(data.table::fread, try_files) %>%
     data.table::rbindlist()
 
   # Create integer reference ID for compact storage
+  PEcAn.logger::logger.info("Adding ReferenceID column")
   raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]]))
 
   # Create tables
+  PEcAn.logger::logger.info("Extracting data values table.")
   data_cols <- c(
     "ObsDataID",        # TRY row ID -- unique to each observation of a given trait
     "ObservationID",    # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf)
@@ -42,6 +45,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
   )
   data_values <- unique(raw_data[, data_cols, with = FALSE])
 
+  PEcAn.logger::logger.info("Extrating datasets table...")
   datasets_cols <- c(
     "DatasetID",
     "Dataset",
@@ -52,6 +56,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
   )
   datasets_values <- unique(raw_data[, datasets_cols, with = FALSE])
 
+  PEcAn.logger::logger.info("Extracting traits table...")
   traits_cols <- c(
     "DataID",
     "DataName",
@@ -60,6 +65,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
   )
   traits_values <- unique(raw_data[, traits_cols, with = FALSE])
 
+  PEcAn.logger::logger.info("Extracting species table...")
   species_cols <- c(
     "AccSpeciesID",
     "AccSpeciesName",
@@ -67,12 +73,19 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
   )
   species_values <- unique(raw_data[, species_cols, with = FALSE])
 
+  PEcAn.logger::logger.info("Writing tables to SQLite database...")
   con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
   on.exit(DBI::dbDisconnect(con))
+  PEcAn.logger::logger.info("Writing values table...")
   DBI::dbWriteTable(con, "values", data_values)
+  PEcAn.logger::logger.info("Writing traits table...")
   DBI::dbWriteTable(con, "traits", data_values)
+  PEcAn.logger::logger.info("Writing datasets table...")
   DBI::dbWriteTable(con, "datasets", datasets_values)
+  PEcAn.logger::logger.info("Writing species table...")
   DBI::dbWriteTable(con, "species", species_values)
 
+  PEcAn.logger::logger.info("Done creating TRY SQLite database!")
+
   NULL
 }

From 90ac2ba0ff0beb8724d5963fabf6b6b7f097da5c Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Fri, 2 Feb 2018 10:57:52 -0500
Subject: [PATCH 14/18] DB: Improve directory management in TRY workflow

Use the `here` package to always work relative to `db` package
directory. Also, set paths and connections in a `config.R` script.
---
 base/db/.here                           |  0
 base/db/inst/import-try/01_try_sqlite.R |  6 +++---
 base/db/inst/import-try/config.R        | 14 ++++++++++++++
 3 files changed, 17 insertions(+), 3 deletions(-)
 create mode 100644 base/db/.here
 create mode 100644 base/db/inst/import-try/config.R

diff --git a/base/db/.here b/base/db/.here
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/base/db/inst/import-try/01_try_sqlite.R b/base/db/inst/import-try/01_try_sqlite.R
index 450f1cbbde1..9480ea8687e 100644
--- a/base/db/inst/import-try/01_try_sqlite.R
+++ b/base/db/inst/import-try/01_try_sqlite.R
@@ -1,10 +1,10 @@
 # Create the TRY SQLite database
 library(PEcAn.DB)
 library(PEcAn.logger)
+library(here)
 
-# Create the TRY SQLite database
-try_files <- "~/try-data/1829.txt"
-sqlite_file <- "try.sqlite"
+configfile <- here("inst", "import-try", "config.R")
+source(configfile)
 
 if (!exists("overwrite")) {
   overwrite <- FALSE
diff --git a/base/db/inst/import-try/config.R b/base/db/inst/import-try/config.R
new file mode 100644
index 00000000000..eff2c8ce7a1
--- /dev/null
+++ b/base/db/inst/import-try/config.R
@@ -0,0 +1,14 @@
+# Character vector containing TRY file paths
+try_files <- c("~/Projects/try-data/4143.txt")
+stopifnot(all(file.exists(try_files)))
+
+# Path to generated TRY SQLite file
+sqlite_file <- "inst/import-try/try.sqlite"
+
+# Bety connection configuration
+betyparams <- list(
+  host = "localhost",
+  dbname = "bety",
+  user = "bety",
+  password = "bety"
+)

From 951012399032ba4f2e31e33f8192223f3b52e40d Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Fri, 2 Feb 2018 12:26:23 -0600
Subject: [PATCH 15/18] DB: Clean up TRY citations script

---
 base/db/DESCRIPTION                    |  3 +-
 base/db/inst/import-try/02_citations.R | 52 +++++++++++++++++++++-----
 2 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION
index 8dabca46856..654a80817ee 100644
--- a/base/db/DESCRIPTION
+++ b/base/db/DESCRIPTION
@@ -29,7 +29,8 @@ Suggests:
     testthat (>= 1.0.2),
     tidyverse,
     data.table,
-    rcrossref
+    rcrossref,
+    here
 License: FreeBSD + file LICENSE
 Copyright: Authors
 LazyLoad: yes
diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R
index beb3e7c6b0b..453d65dd007 100644
--- a/base/db/inst/import-try/02_citations.R
+++ b/base/db/inst/import-try/02_citations.R
@@ -5,22 +5,56 @@ library(RSQLite)
 library(rcrossref)
 library(PEcAn.logger)
 library(PEcAn.DB)
+library(here)
 
-logger.setLevel("DEBUG")
+wd <- here("inst", "import-try")
+configfile <- file.path(wd, "config.R")
+source(configfile)
+
+data_dir <- file.path(wd, "data-proc")
+dir.create(data_dir, showWarnings = FALSE)
 
-sqlite_file <- "try.sqlite"
+logger.setLevel("DEBUG")
 
 trydb <- dbConnect(SQLite(), sqlite_file)
 reference_dat <- tbl(trydb, "datasets") %>%
   distinct(Reference, ReferenceID) %>%
   collect()
 
-refs_proc <- reference_dat %>%
-  mutate(cr_df = map(Reference, search_references, min_score = 40)) %>%
-  unnest()
+refs_proc_file <- file.path(data_dir, "refs_proc.rds")
+if (file.exists(refs_proc_file)) {
+  refs_proc <- readRDS(refs_proc_file)
+} else {
+  refs_proc <- reference_dat %>%
+    mutate(cr_df = map(Reference, search_references, min_score = 40)) %>%
+    unnest()
+  saveRDS(refs_proc, refs_proc_file)
+}
 
-source("inst/import-try/bety_connect.R")
-bety_refs <- db_merge_into(refs_proc, "citations", bety, "doi") %>%
-  collect()
+# Replace bad matches with NA
+minscore <- 85
+fill_na <- function(field, score) {
+  na <- as(NA, class(field))
+  if_else(score > minscore, field, na)
+}
+refs_proc2 <- refs_proc %>%
+  mutate_at(
+    c("author", "year", "journal", "vol", "pg", "doi"),
+    fill_na,
+    score = .$score
+  ) %>%
+  mutate(
+    title = if_else(score > minscore, title, Reference)
+  ) %>%
+  # TODO: Hack until Bety author column is extended
+  mutate_if(is.character, substr, start = 0, stop = 254) %>%
+  # TODO: Hack until Bety constraints on these columns are relaxed
+  select(-pg)
 
-# TODO: Use object `bety`, table `citations`
+
+# Check character column length
+
+bety <- db.open(betyparams)
+
+bety_refs <- db_merge_into(refs_proc2, "citations", bety) %>%
+  collect()

From 717c47711a6be9e5282451e22aeb45a9640418e4 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Wed, 7 Feb 2018 15:58:48 -0500
Subject: [PATCH 16/18] DB: Working TRY citations import

---
 base/db/.gitignore                     |  1 +
 base/db/R/search_references.R          |  9 +++---
 base/db/R/zz.imports.R                 |  7 +++--
 base/db/inst/import-try/02_citations.R | 39 +++++++++++++++++---------
 4 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/base/db/.gitignore b/base/db/.gitignore
index bcf2011eddd..c1e2f2a3c7b 100644
--- a/base/db/.gitignore
+++ b/base/db/.gitignore
@@ -1 +1,2 @@
 try.sqlite
+inst/import-try/data-proc
diff --git a/base/db/R/search_references.R b/base/db/R/search_references.R
index 86f5e4150f2..14b61adc7bb 100644
--- a/base/db/R/search_references.R
+++ b/base/db/R/search_references.R
@@ -53,14 +53,15 @@ search_reference_single <- function(query, limit = 1, min_score = 85) {
   )
   proc_search <- crdata %>%
     dplyr::mutate(
-      author = purrr::map_chr(
-        author,
-        ~paste(.[["given"]], .[["family"]], sep = " ", collapse = ", ")),
+      # Get the first author only -- this is the BETY format
+      author_family = purrr::map(author, list("family", 1)),
+      author_given = purrr::map(author, list("given", 1)),
+      author = paste(author_family, author_given, sep = ", "),
       year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(),
       query = query,
       score = as.numeric(score)
     )
   use_cols <- keep_cols[keep_cols %in% colnames(proc_search)]
-  select(proc_search, !!!use_cols)
+  dplyr::select(proc_search, !!!use_cols)
 }
 
diff --git a/base/db/R/zz.imports.R b/base/db/R/zz.imports.R
index 050b5a37cf9..fbdf9b818aa 100644
--- a/base/db/R/zz.imports.R
+++ b/base/db/R/zz.imports.R
@@ -1,3 +1,4 @@
-##' Imports from other packages
-##'
-##' @importFrom magrittr `%>%`
+#' Imports from other packages
+#'
+#' @importFrom magrittr `%>%`
+#' @importFrom rlang `!!` `!!!`
diff --git a/base/db/inst/import-try/02_citations.R b/base/db/inst/import-try/02_citations.R
index 453d65dd007..98c1dc56758 100644
--- a/base/db/inst/import-try/02_citations.R
+++ b/base/db/inst/import-try/02_citations.R
@@ -14,9 +14,18 @@ source(configfile)
 data_dir <- file.path(wd, "data-proc")
 dir.create(data_dir, showWarnings = FALSE)
 
-logger.setLevel("DEBUG")
+bety <- db.open(betyparams)
+if (!"notes" %in% dbListFields(bety, "citations")) {
+  logger.severe(
+    "`notes` column required in Bety citations table ",
+    "but not found in this version of Bety. ",
+    "Please make sure you have performed schema migration ",
+    "version 20180206152600 (relax_citations)."
+  )
+}
 
 trydb <- dbConnect(SQLite(), sqlite_file)
+
 reference_dat <- tbl(trydb, "datasets") %>%
   distinct(Reference, ReferenceID) %>%
   collect()
@@ -25,9 +34,11 @@ refs_proc_file <- file.path(data_dir, "refs_proc.rds")
 if (file.exists(refs_proc_file)) {
   refs_proc <- readRDS(refs_proc_file)
 } else {
+  logger.setLevel("DEBUG")    # To get status messages
   refs_proc <- reference_dat %>%
     mutate(cr_df = map(Reference, search_references, min_score = 40)) %>%
     unnest()
+  logger.setLevel("INFO")
   saveRDS(refs_proc, refs_proc_file)
 }
 
@@ -39,22 +50,22 @@ fill_na <- function(field, score) {
 }
 refs_proc2 <- refs_proc %>%
   mutate_at(
-    c("author", "year", "journal", "vol", "pg", "doi"),
+    c("title", "author", "year", "journal", "vol", "pg", "doi"),
     fill_na,
     score = .$score
   ) %>%
   mutate(
-    title = if_else(score > minscore, title, Reference)
-  ) %>%
-  # TODO: Hack until Bety author column is extended
-  mutate_if(is.character, substr, start = 0, stop = 254) %>%
-  # TODO: Hack until Bety constraints on these columns are relaxed
-  select(-pg)
-
-
-# Check character column length
-
-bety <- db.open(betyparams)
+    title = if_else(!is.na(title), title, paste0("TRY ReferenceID ", ReferenceID)),
+    author = if_else(!is.na(author), author, "Unknown TRY data (see title)"),
+    author = substr(author, 0, 254),    # Trim author to 255 characters
+    journal = if_else(!is.na(journal), journal, "Unknown TRY data (see title)"),
+    # Use the Kattge 2007 TRY paper's DOI as a placeholder
+    doi = if_else(!is.na(doi), doi, "10.1111/j.1365-2486.2011.02451.x"),
+    year = if_else(!is.na(year), year, 2018),
+    pg = if_else(!is.na(pg), pg, "9999"),
+    notes = paste("Original TRY reference: ", Reference)
+  )
 
-bety_refs <- db_merge_into(refs_proc2, "citations", bety) %>%
+bety_refs <- db_merge_into(refs_proc2, "citations", bety, "notes") %>%
   collect()
+saveRDS(bety_refs, file.path(data_dir, "refs_bety.rds"))

From 2e889034d5b930e2e199bcf892f82529979466c7 Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Wed, 7 Feb 2018 16:22:19 -0500
Subject: [PATCH 17/18] DB: Fix try2sqlite bug

---
 base/db/R/try2sqlite.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/db/R/try2sqlite.R b/base/db/R/try2sqlite.R
index a95f96aa08c..828afd00957 100644
--- a/base/db/R/try2sqlite.R
+++ b/base/db/R/try2sqlite.R
@@ -79,7 +79,7 @@ try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
   PEcAn.logger::logger.info("Writing values table...")
   DBI::dbWriteTable(con, "values", data_values)
   PEcAn.logger::logger.info("Writing traits table...")
-  DBI::dbWriteTable(con, "traits", data_values)
+  DBI::dbWriteTable(con, "traits", traits_values)
   PEcAn.logger::logger.info("Writing datasets table...")
   DBI::dbWriteTable(con, "datasets", datasets_values)
   PEcAn.logger::logger.info("Writing species table...")

From 433c96e46a73a32965296f81108595fdc079d4bc Mon Sep 17 00:00:00 2001
From: Alexey Shiklomanov <ashiklom@bu.edu>
Date: Thu, 8 Feb 2018 12:04:06 -0500
Subject: [PATCH 18/18] DB: Address @infotroph PR #1848 comments

Notable changes:
- Use `glue::glue_sql` instead of `dbplyr::build_sql` for generating SQL
query. The syntax is much cleaner. This also fixes the breaking tests by
improving the way column names are quoted.
- Remove the `.here` file
- In tests and examples, use `":memory:"` SQLite database instead of a
temporary file
- Use explicit `package::function` syntax in tests, to test for
NAMESPACE issues
---
 base/db/.here                        |  0
 base/db/DESCRIPTION                  |  4 ++
 base/db/NAMESPACE                    |  1 +
 base/db/R/db_merge_into.R            | 18 +++---
 base/db/R/insert_table.R             | 45 +++++++-------
 base/db/R/zz.imports.R               |  9 ++-
 base/db/man/build_insert_query.Rd    |  4 +-
 base/db/man/db_merge_into.Rd         | 23 +++++---
 base/db/man/insert_table.Rd          | 12 ++--
 base/db/man/match_dbcols.Rd          |  4 +-
 base/db/man/otherimports.Rd          | 19 ++++++
 base/db/tests/testthat/test.insert.R | 88 +++++++++++++---------------
 12 files changed, 129 insertions(+), 98 deletions(-)
 delete mode 100644 base/db/.here
 create mode 100644 base/db/man/otherimports.Rd

diff --git a/base/db/.here b/base/db/.here
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/base/db/DESCRIPTION b/base/db/DESCRIPTION
index 654a80817ee..82366762c69 100644
--- a/base/db/DESCRIPTION
+++ b/base/db/DESCRIPTION
@@ -18,6 +18,10 @@ Imports:
     PEcAn.utils,
     dbplyr (>= 1.2.0),
     dplyr,
+    tibble,
+    purrr,
+    tidyr,
+    glue,
     lubridate,
     magrittr,
     ncdf4,
diff --git a/base/db/NAMESPACE b/base/db/NAMESPACE
index 67a63071aa8..01bcbcd748e 100644
--- a/base/db/NAMESPACE
+++ b/base/db/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export("%>%")
 export(append.covariate)
 export(assign.treatments)
 export(bety2pecan)
diff --git a/base/db/R/db_merge_into.R b/base/db/R/db_merge_into.R
index eac93e184ab..a00718e5ac3 100644
--- a/base/db/R/db_merge_into.R
+++ b/base/db/R/db_merge_into.R
@@ -1,21 +1,19 @@
 #' Merge local data frame into SQL table
 #'
 #' @inheritParams insert_table
+#' @inheritDotParams insert_table
 #' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values`
 #' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table)
 #' @export
 #' @examples
-#' library(dplyr)
-#' library(RSQLite)
-#' irisfile <- tempfile(fileext = ".sqlite")
-#' irisdb <- dbConnect(SQLite(), irisfile)
-#' copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
+#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+#' dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
 #' db_merge_into(iris[1:12,], "iris", irisdb)
-#' tbl(irisdb, "iris") %>% count()
-db_merge_into <- function(values, table, con, by = NULL) {
-  values_fixed <- match_dbcols(values, table, con)
+#' dplyr::tbl(irisdb, "iris") %>% dplyr::count()
+db_merge_into <- function(values, table, con, by = NULL, drop = FALSE, ...) {
+  values_fixed <- match_dbcols(values, table, con, drop = FALSE)
   if (is.null(by)) {
-    by <- colnames(values_fixed)
+    by <- match_colnames(values, table, con)
   }
   sql_tbl <- dplyr::tbl(con, table)
   values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE)
@@ -24,7 +22,7 @@ db_merge_into <- function(values, table, con, by = NULL) {
       "Input table for merge is empty."
     )
   } else {
-    insert <- insert_table(values_merge, table, con)
+    insert <- insert_table(values_merge, table, con, ...)
   }
   dplyr::tbl(con, table) %>%
     dplyr::inner_join(values_fixed, copy = TRUE)
diff --git a/base/db/R/insert_table.R b/base/db/R/insert_table.R
index 8d66449aaec..f9e45c488bf 100644
--- a/base/db/R/insert_table.R
+++ b/base/db/R/insert_table.R
@@ -9,20 +9,18 @@
 #' @param table Name of target SQL table, as character
 #' @param coerce_col_class logical, whether or not to coerce local data columns 
 #' to SQL classes. Default = `TRUE.`
+#' @param drop logical. If `TRUE` (default), drop columns not found in SQL table.
 #' @inheritParams db.query
 #' @inherit db.query return
 #' @export
 #' @examples
-#' library(dplyr)
-#' irisfile <- tempfile(fileext = ".sqlite")
-#' irisdb <- src_sqlite(irisfile, create = TRUE)
-#' copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
+#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+#' dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
 #' insert_table(iris[-1,], "iris", irisdb$con)
-#' tbl(irisdb, "iris")
-insert_table <- function(values, table, con, coerce_col_class = TRUE) {
-  values_fixed <- match_dbcols(values, table, con, coerce_col_class)
-  insert_query <- build_insert_query(values_fixed, table, con = con)
-  print(insert_query)
+#' dplyr::tbl(irisdb, "iris")
+insert_table <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
+  values_fixed <- match_dbcols(values, table, con, coerce_col_class, drop = TRUE)
+  insert_query <- build_insert_query(values_fixed, table, .con = con)
   db.query(insert_query, con)
 }
 
@@ -31,7 +29,7 @@ insert_table <- function(values, table, con, coerce_col_class = TRUE) {
 #' @inheritParams insert_table
 #' @return `values` `data.frame` with column names and classes matched to SQL
 #' @export
-match_dbcols <- function(values, table, con, coerce_col_class = TRUE) {
+match_dbcols <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
   use_cols <- match_colnames(values, table, con)
   if (length(use_cols) < 1) {
     PEcAn.logger::logger.severe(
@@ -44,7 +42,7 @@ match_dbcols <- function(values, table, con, coerce_col_class = TRUE) {
   )
   values_sub <- values[, use_cols]
   # Load one row to get column types
-  sql_row <- dplyr::tbl(con, table) %>% head(1) %>% collect()
+  sql_row <- dplyr::tbl(con, table) %>% head(1) %>% dplyr::collect()
   sql_types <- purrr::map(sql_row, class) %>%
     purrr::map_chr(1) %>%
     .[use_cols]
@@ -76,7 +74,12 @@ match_dbcols <- function(values, table, con, coerce_col_class = TRUE) {
   } else {
     values_fixed <- values_sub
   }
-  values_fixed
+  if (drop) {
+    values_fixed
+  } else {
+    drop_cols <- colnames(values)[!colnames(values) %in% use_cols]
+    dplyr::bind_cols(values_fixed, values[, drop_cols])
+  }
 }
 
 #' Match names of local data frame to SQL table
@@ -92,8 +95,8 @@ match_colnames <- function(values, table, con) {
 #' Build query to insert R data frame into SQL table
 #'
 #' @inheritParams insert_table
-#' @param ... Additional arguments to [dbplyr::build_sql]
-build_insert_query <- function(values, table, ...) {
+#' @inheritParams glue::glue_sql
+build_insert_query <- function(values, table, .con) {
   value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ]))
 
   insert_list <- value_list %>%
@@ -101,15 +104,9 @@ build_insert_query <- function(values, table, ...) {
     purrr::map(dbplyr::escape) %>%
     purrr::map(dbplyr::sql_vector)
 
-  dbplyr::build_sql(
-    dbplyr::sql("INSERT INTO"),
-    dbplyr::sql(" "),
-    dbplyr::ident(table),
-    dbplyr::sql(" "),
-    dbplyr::sql_vector(ident(colnames(values)), collapse = ", "),
-    dbplyr::sql(" "),
-    dbplyr::sql("VALUES"),
-    dbplyr::sql(" "),
-    dbplyr::sql_vector(insert_list, parens = FALSE, collapse = ", "), ...
+  glue::glue_sql(
+    "INSERT INTO {`table`} ({`colnames(values)`*}) ",
+    "VALUES {insert_list*}",
+    .con = .con
   )
 }
diff --git a/base/db/R/zz.imports.R b/base/db/R/zz.imports.R
index fbdf9b818aa..4eb0423a534 100644
--- a/base/db/R/zz.imports.R
+++ b/base/db/R/zz.imports.R
@@ -1,4 +1,9 @@
 #' Imports from other packages
 #'
-#' @importFrom magrittr `%>%`
-#' @importFrom rlang `!!` `!!!`
+#' @name otherimports
+#' @importFrom magrittr %>%
+#' @export
+magrittr::`%>%`
+
+#' @rdname otherimports
+#' @importFrom rlang !! !!!
diff --git a/base/db/man/build_insert_query.Rd b/base/db/man/build_insert_query.Rd
index a11bdc46c84..bbfb23fcfd4 100644
--- a/base/db/man/build_insert_query.Rd
+++ b/base/db/man/build_insert_query.Rd
@@ -4,14 +4,14 @@
 \alias{build_insert_query}
 \title{Build query to insert R data frame into SQL table}
 \usage{
-build_insert_query(values, table, ...)
+build_insert_query(values, table, .con)
 }
 \arguments{
 \item{values}{`data.frame` of values to write to SQL database}
 
 \item{table}{Name of target SQL table, as character}
 
-\item{...}{Additional arguments to [dbplyr::build_sql]}
+\item{.con}{[\code{DBIConnection}]:A DBI connection object obtained from \code{DBI::dbConnect()}.}
 }
 \description{
 Build query to insert R data frame into SQL table
diff --git a/base/db/man/db_merge_into.Rd b/base/db/man/db_merge_into.Rd
index 764dc5a92ad..63752d4b027 100644
--- a/base/db/man/db_merge_into.Rd
+++ b/base/db/man/db_merge_into.Rd
@@ -4,7 +4,7 @@
 \alias{db_merge_into}
 \title{Merge local data frame into SQL table}
 \usage{
-db_merge_into(values, table, con, by = NULL)
+db_merge_into(values, table, con, by = NULL, drop = FALSE, ...)
 }
 \arguments{
 \item{values}{`data.frame` of values to write to SQL database}
@@ -14,6 +14,18 @@ db_merge_into(values, table, con, by = NULL)
 \item{con}{database connection object}
 
 \item{by}{Character vector of columns by which to perform merge. Defaults to all columns in `values`}
+
+\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.}
+
+\item{...}{Arguments passed on to \code{insert_table}
+\describe{
+  \item{values}{`data.frame` of values to write to SQL database}
+  \item{table}{Name of target SQL table, as character}
+  \item{coerce_col_class}{logical, whether or not to coerce local data columns 
+to SQL classes. Default = `TRUE.`}
+  \item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.}
+  \item{con}{database connection object}
+}}
 }
 \value{
 Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table)
@@ -22,11 +34,8 @@ Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy q
 Merge local data frame into SQL table
 }
 \examples{
-library(dplyr)
-library(RSQLite)
-irisfile <- tempfile(fileext = ".sqlite")
-irisdb <- dbConnect(SQLite(), irisfile)
-copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
+irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
 db_merge_into(iris[1:12,], "iris", irisdb)
-tbl(irisdb, "iris") \%>\% count()
+dplyr::tbl(irisdb, "iris") \%>\% dplyr::count()
 }
diff --git a/base/db/man/insert_table.Rd b/base/db/man/insert_table.Rd
index 54c7f3d7320..c51740868ce 100644
--- a/base/db/man/insert_table.Rd
+++ b/base/db/man/insert_table.Rd
@@ -4,7 +4,7 @@
 \alias{insert_table}
 \title{Insert R data frame into SQL database}
 \usage{
-insert_table(values, table, con, coerce_col_class = TRUE)
+insert_table(values, table, con, coerce_col_class = TRUE, drop = TRUE)
 }
 \arguments{
 \item{values}{`data.frame` of values to write to SQL database}
@@ -15,6 +15,8 @@ insert_table(values, table, con, coerce_col_class = TRUE)
 
 \item{coerce_col_class}{logical, whether or not to coerce local data columns 
 to SQL classes. Default = `TRUE.`}
+
+\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.}
 }
 \value{
 data frame with query results
@@ -26,10 +28,8 @@ Then, build an SQL string for the insert statement. Finally, insert into the
 database.
 }
 \examples{
-library(dplyr)
-irisfile <- tempfile(fileext = ".sqlite")
-irisdb <- src_sqlite(irisfile, create = TRUE)
-copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
+irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
 insert_table(iris[-1,], "iris", irisdb$con)
-tbl(irisdb, "iris")
+dplyr::tbl(irisdb, "iris")
 }
diff --git a/base/db/man/match_dbcols.Rd b/base/db/man/match_dbcols.Rd
index 490e5f6b58a..ba0b6ae8d00 100644
--- a/base/db/man/match_dbcols.Rd
+++ b/base/db/man/match_dbcols.Rd
@@ -4,7 +4,7 @@
 \alias{match_dbcols}
 \title{Match column names and classes between local and SQL table}
 \usage{
-match_dbcols(values, table, con, coerce_col_class = TRUE)
+match_dbcols(values, table, con, coerce_col_class = TRUE, drop = TRUE)
 }
 \arguments{
 \item{values}{`data.frame` of values to write to SQL database}
@@ -15,6 +15,8 @@ match_dbcols(values, table, con, coerce_col_class = TRUE)
 
 \item{coerce_col_class}{logical, whether or not to coerce local data columns 
 to SQL classes. Default = `TRUE.`}
+
+\item{drop}{logical. If `TRUE` (default), drop columns not found in SQL table.}
 }
 \value{
 `values` `data.frame` with column names and classes matched to SQL
diff --git a/base/db/man/otherimports.Rd b/base/db/man/otherimports.Rd
new file mode 100644
index 00000000000..dd79f7fbbc3
--- /dev/null
+++ b/base/db/man/otherimports.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/zz.imports.R
+\docType{import}
+\name{otherimports}
+\alias{otherimports}
+\alias{\%>\%}
+\title{Imports from other packages}
+\description{
+Imports from other packages
+}
+\keyword{internal}
+\description{
+These objects are imported from other packages. Follow the links
+below to see their documentation.
+
+\describe{
+  \item{magrittr}{\code{\link[magrittr]{\%>\%}}}
+}}
+
diff --git a/base/db/tests/testthat/test.insert.R b/base/db/tests/testthat/test.insert.R
index 6df2179c156..95dd80a459e 100644
--- a/base/db/tests/testthat/test.insert.R
+++ b/base/db/tests/testthat/test.insert.R
@@ -1,53 +1,49 @@
 library(PEcAn.DB)
-library(DBI)
 library(testthat)
-library(dplyr)
 context("SQL insertion helper functions")
 
-if (requireNamespace("RSQLite")) {
-  library(RSQLite)
-  iris_file <- tempfile(fileext = ".sqlite")
-  irisdb <- dbConnect(SQLite(), iris_file)
-  iris <- as_tibble(iris) %>%
-    mutate(
-      Species = as.character(Species)
+test_that(
+  "RSQLite-dependent tests work",
+  {
+    skip_if_not_installed("RSQLite")
+    irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
+    iris <- tibble::as_tibble(iris) %>%
+      dplyr::mutate(
+        Species = as.character(Species)
+      )
+    dplyr::copy_to(irisdb, iris[1,], "iris", overwrite = TRUE)
+    # Add extra column to see if it's successfully ignored
+    iris2 <- dplyr::mutate(iris, extracol = seq_len(nrow(iris)))
+    iris_insert <- iris2[2:10,]
+    .insert <- insert_table(iris_insert, "iris", irisdb)
+    test_that(
+      "Subset of iris was inserted into database",
+      {
+        iris_insert_test <- dplyr::tbl(irisdb, "iris") %>% dplyr::collect()
+        expect_equal(iris[1:10,], iris_insert_test)
+      }
     )
 
-  copy_to(irisdb, iris[1,], "iris", overwrite = TRUE)
-  # Add extra column to see if it's successfully ignored
-  iris2 <- mutate(iris, extracol = row_number())
-  iris_insert <- iris2[2:10,]
-  .insert <- insert_table(iris_insert, "iris", irisdb)
-  iris_insert_test <- tbl(irisdb, "iris") %>% collect()
-  test_that(
-    "Subset of iris was inserted into database",
-    {
-      expect_equal(iris[1:10,], iris_insert_test)
-    }
-  )
-
-  iris_merge <- iris2[5:12,]
-  out_merge <- db_merge_into(iris_merge, "iris", irisdb)
-  out_merge2 <- db_merge_into(iris_merge, "iris", irisdb)
-  iris_merge_nrow <- tbl(irisdb, "iris") %>%
-    count() %>%
-    pull(n)
-  test_that(
-    "Only subset of iris data were merged",
-    {
-      expect_equal(collect(out_merge), iris[5:12,])
-      expect_equal(collect(out_merge), collect(out_merge2))
-    }
-  )
-
-  test_that(
-    "Extra column (not in SQL) was retained in `out_merge`",
-    {
-      expect_true("extracol" %in% colnames(out_merge))
-    }
-  )
-
-} else {
-  message("Skipping insert tests because `RSQLite` not installed.")
-}
+    iris_merge <- iris2[5:12,]
+    out_merge <- db_merge_into(iris_merge, "iris", irisdb) %>%
+      dplyr::collect()
+    iris_merge_nrow <- dplyr::tbl(irisdb, "iris") %>%
+      dplyr::count() %>%
+      dplyr::pull(n)
+    test_that(
+      "Only subset of iris data were merged",
+      {
+        expect_equal(out_merge, iris2[5:12,])
+        out_merge2 <- db_merge_into(iris_merge, "iris", irisdb) %>%
+          dplyr::collect()
+        expect_equal(out_merge, out_merge2)
+      }
+    )
 
+    test_that(
+      "Extra column (not in SQL) was retained in `out_merge`",
+      {
+        expect_true("extracol" %in% colnames(out_merge))
+      }
+    )
+  })