Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions R/bmerge.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,20 @@
}
stopf("Incompatible join types: %s (%s) and %s (%s). Factor columns must join to factor or character columns.", xname, x_merge_type, iname, i_merge_type)
}
# Prepare a complex join column for merging by treating it as double.
#
#   dt          table holding the join column; the column is read as dt[[col]]
#   col         selector of the join column within dt
#   side_name   display name of this column; used in the error message and passed to coerce_col()
#   other_name  display name of the column on the other side of the join; passed to coerce_col()
#
# Stops with an error if any non-NA value has a non-zero imaginary part. Otherwise calls
# coerce_col() for a complex -> double conversion and evaluates to the new merge type, "double".
# `verbose` is not a parameter here: it is picked up from the enclosing scope.
handle_complex_merge_type = function(dt, col, side_name, other_name) {
  # na.rm=TRUE so that NA entries cannot turn the any() result into NA, which if() would reject
  if (any(Im(dt[[col]]) != 0, na.rm = TRUE)) {
    stopf("Joining on complex numbers with non-zero imaginary part is not supported. Column: %s", side_name)
  }
  from_detail = gettext(" (complex with zero imaginary part)")
  # NOTE(review): coerce_col() is defined outside this view; presumably it updates dt[[col]] for the caller -- confirm
  coerce_col(dt, col, "complex", "double", side_name, other_name, from_detail = from_detail, verbose = verbose)
  # last expression is the function's value; explicit return() is not needed (return_linter)
  "double"
}
# complex join columns are handled first: handle_complex_merge_type() either stops (a value has a
# non-zero imaginary part) or returns the merge type that the checks below should use instead
if (i_merge_type == "complex") {
i_merge_type = handle_complex_merge_type(i, icol, iname, xname)
}
if (x_merge_type == "complex") {
x_merge_type = handle_complex_merge_type(x, xcol, xname, iname)
}
# we check factors first to cater for the case when trying to do rolling joins on factors
if (x_merge_type == i_merge_type) {
if (verbose) catf("%s has same type (%s) as %s. No coercion needed.\n", iname, x_merge_type, xname)
Expand Down
44 changes: 41 additions & 3 deletions inst/tests/tests.Rraw
Original file line number Diff line number Diff line change
Expand Up @@ -15746,7 +15746,7 @@ DT1 = data.table(a = sample(3L, 15L, TRUE) + .1, b=sample(c(TRUE, FALSE, NA), 15
DT2 = data.table(a = sample(3L, 6L, TRUE) + .1, b=sample(c(TRUE, FALSE, NA), 6L, TRUE))
test(2069.32, DT1[DT2, .(y = sum(b, na.rm=TRUE)), by=.EACHI, on=c(a = 'a', b="b")]$y, rep(0L, 6L))
DT = data.table(z = 1i)
test(2069.33, DT[DT, on = 'z'], error = "Type 'complex' is not supported for joining/merging")
test(2069.33, DT[DT, on = 'z'], error = "Joining on complex numbers with non-zero imaginary part is not supported. Column: i.z")

# forder verbose message when !isReallyReal Date, #1738
date_dbl = as.Date(as.double(seq(as.Date("2015-01-01"), as.Date("2015-01-05"), by="days")), origin="1970-01-01")
Expand Down Expand Up @@ -17326,7 +17326,7 @@ test(2182.75, melt(data.table(a=10, b=20), measure.vars=list(n="a"), variable.fa
measurev = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used.
test(2183.00001, melt(DT.wide, measure.vars=measurev()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2)))
measurev = list("foo", "bar")#measurev below should not use this since it is not a function.
test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging")
test(2183.00002, melt(DTid, measure.vars=measurev(list(value.name=NULL, num=as.complex), pattern="([ab])([12])")), error="variable_table does not support column type 'complex' for column 'num'")
test(2183.00004, melt(DTid, measure.vars=measurev(list(value.name=NULL, istr=NULL), pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))
test(2183.00005, melt(DTid, measure.vars=measurev(list(column=NULL, istr=NULL), pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword
iris.dt = data.table(iris)
Expand All @@ -17349,7 +17349,7 @@ test(2183.00060, melt(DTid, measure.vars=measurev(list(letter=myfac, value.name=
measure = function(cols)cols # user-defined function for computing measure.vars, same name as data.table::measure but user-defined version should be used.
test(2183.01, melt(DT.wide, measure.vars=measure()), data.table(variable=factor(c("a2","b1","b2")), value=c(2,1,2)))
measure = list("foo", "bar")#measure below should not use this since it is not a function.
test(2183.02, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="Type 'complex' is not supported for joining/merging")
test(2183.02, melt(DTid, measure.vars=measure(value.name, num=as.complex, pattern="([ab])([12])")), error="variable_table does not support column type 'complex' for column 'num'")
test(2183.03, melt(DTid, measure.vars=structure(list(a=c(NA,"a2"),b=c("b1","b2")), variable_table=data.table(number=as.complex(1:2)))), error="variable_table does not support column type 'complex' for column 'number'")
test(2183.04, melt(DTid, measure.vars=measure(value.name, istr, pattern="([ab])([12])"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))
test(2183.05, melt(DTid, measure.vars=measure(column, istr, pattern="([ab])([12])", multiple.keyword="column"))[order(b)], data.table(id=1, istr=paste(c(1,2)), a=c(NA, 2), b=c(1,2)))#same computation but different multiple.keyword
Expand Down Expand Up @@ -21167,3 +21167,41 @@ test(2317.6, DT1[DF1, on='a', .(d = x.a + i.d)]$d, 5)
test(2317.7, DT1[DF2, on='a', e := i.e]$e, 5)
test(2317.8, DT1[DF2, on='a', e2 := x.a + i.e]$e2, 6)
test(2317.9, DT1[DF2, on='a', .(e = x.a + i.e)]$e, 6)

#6627
# complex x column (all imaginary parts zero) joined to an integer i column
test(2318.1, {
DT1 = data.table(a = complex(real = 1:3, imaginary = 0), x = letters[1:3])
DT2 = data.table(a = 2L)
res = DT1[DT2, on = "a"]
identical(res$x, "b") && is.numeric(res$a) && res$a == 2
})
# complex x column (all imaginary parts zero) joined to a double i column
test(2318.2, {
DT1 = data.table(a = complex(real = c(1, 2, 3), imaginary = 0), x = letters[1:3])
DT2 = data.table(a = c(2.0, 3.0)) # double
res = DT1[DT2, on = "a"]
identical(res$x, c("b", "c")) && typeof(res$a) == "double"
})
# integer x column joined to a complex i column (all imaginary parts zero)
test(2318.3, {
DT1 = data.table(a = c(1L, 2L, 3L)) # integer
DT2 = data.table(a = complex(real = c(2, 3), imaginary = 0), y = letters[1:2])
res = DT1[DT2, on = "a"]
identical(res$y, c("a", "b")) && typeof(res$a) == "integer"
})
# complex columns on both sides, all imaginary parts zero
test(2318.4, {
DT1 = data.table(a = complex(real = c(1, 2, 3), imaginary = 0), x = letters[1:3])
DT2 = data.table(a = complex(real = c(2, 3), imaginary = 0))
res = DT1[DT2, on = "a"]
identical(res$x, c("b", "c")) && typeof(res$a) == "complex"
})
# A non-zero imaginary part on either side of the join must stop with an error.
# Uses test()'s error= argument (as test 2069.33 does) rather than capturing the message by hand,
# so a missing error is reported as such instead of as a failed grepl().
# x side holds the offending value (2+1i)
DT1 = data.table(a = complex(real = c(1, 2), imaginary = c(0, 1)), x = letters[1:2])
DT2 = data.table(a = 2L)
test(2318.5, DT1[DT2, on = "a"], error = "non-zero imaginary part")
# i side holds the offending value (2+1i)
DT1 = data.table(a = 2L)
DT2 = data.table(a = complex(real = c(1, 2), imaginary = c(0, 1)), y = c("a", "b"))
test(2318.6, DT1[DT2, on = "a"], error = "non-zero imaginary part")
27 changes: 27 additions & 0 deletions man/data.table.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,12 @@ A \code{data.table} is a \code{list} of vectors, just like a \code{data.frame}.

See the \code{see also} section for the several other \emph{methods} that are available for operating on data.tables efficiently.


A \code{data.table} supports joins on columns of type \code{complex}. The join logic is as follows:
\itemize{
\item If every value in a \code{complex} join column has a zero imaginary part (e.g., \code{10+0i}), the column is treated as \code{double} for the join, allowing it to match \code{integer} and \code{double} columns.
\item If any value in a \code{complex} join column has a non-zero imaginary part (e.g., \code{10+2i}), the join will stop with an error, as there is no defined way to sort or match such a number against a real number.
Comment on lines +237 to +238
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if this feature is desirable. For the new example using current master I get

> products_clean[sales, on = .(id = product_id), nomatch = 0]
Erreur dans bmerge(i, x, leftcols, rightcols, roll, rollends, nomatch, mult,  : 
  typeof x.id (complex) != typeof i.product_id (integer)

which seems totally reasonable and actionable. If I want a join, I would need to convert the types to be the same.

So I would suggest closing this pr and opening a new one that clarifies the documentation. What do you think @jangorecki @MichaelChirico @ben-schwen @aitap ?
(I don't really use complex numbers so I don't understand if this is a typical or desirable use case, but I guess any user is capable of converting complex to another joinable type)

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Personally I would not add examples of joining of complex columns into the manual. It is just too uncommon. Unit tests yes, NEWS entry yes, but manual examples not really.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, thank you for the detailed feedback.
I understand there are two key takeaways from the discussion:

  • Documentation: I will remove the examples from the .Rd file along with the rm() command, and I will open a new PR specifically for the documentation cleanup.
  • Feature Design: There is an ongoing higher-level discussion about whether data.table should automatically coerce complex to double for joins, or if it's better to retain the existing behavior and require the user to handle coercion explicitly.

Please let me know the final decision on the feature design and how you'd like me to proceed. Thanks!

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please ask @MichaelChirico for review because he filed the original issue.

}
}
\references{
\url{https://r-datatable.com} (\code{data.table} homepage)\cr
Expand Down Expand Up @@ -441,6 +447,27 @@ DT[, c(.(y=max(y)), lapply(.SD, min)), by=rleid(v), .SDcols=v:b]
# Support guide and links:
# https://github.com/Rdatatable/data.table/wiki/Support

# Example: Joining with a system that uses complex IDs (#6627)

# Case 1: Joining a "clean" set of products.
# Here, `products_clean` only contains IDs with zero imaginary parts.
products_clean = data.table(id = c(101+0i, 103+0i), name = c("widget", "thingamajig"))
sales = data.table(product_id = c(101, 103), units_sold = c(50, 75))

# This join works: every value in the 'id' column of `products_clean` has a zero imaginary part,
# so the column is joined as double.
products_clean[sales, on = .(id = product_id), nomatch = 0]

# Case 2: Joining a list that includes "bad" IDs.
# Here, `products_all` contains an ID with a non-zero imaginary part (102+1i).
products_all = data.table(id = c(101+0i, 102+1i, 103+0i), name = c("widget", "gadget", "thingamajig"))

# The join stops with an error: the whole 'id' column is checked before joining,
# and 102+1i has a non-zero imaginary part.
try(products_all[sales, on = .(id = product_id), nomatch = 0])
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please remove.
examples should show typical use cases, not errors.


\dontshow{
rm(products_clean, sales, products_all)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we don't have any rm() commands in dontshow in other Rd files, please remove.

}

\dontrun{
if (interactive()) {
vignette(package="data.table") # 9 vignettes
Expand Down
Loading