From 62f274a3a301885831894e325acd285235825e71 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Fri, 3 Jun 2022 22:34:54 +0200 Subject: [PATCH 1/7] Check for ALTREP --- src/assign.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/assign.c b/src/assign.c index 7fb09fa71e..9627dbd394 100644 --- a/src/assign.c +++ b/src/assign.c @@ -497,11 +497,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) if (length(rows)==0 && targetlen==vlen && (vlen>0 || nrow==0)) { if ( MAYBE_SHARED(thisvalue) || // set() protects the NAMED of atomic vectors from .Call setting arguments to 2 by wrapping with list (TYPEOF(values)==VECSXP && i>LENGTH(values)-1) || // recycled RHS would have columns pointing to others, #185. - (TYPEOF(values)!=VECSXP && i>0) // assigning the same values to a second column. Have to ensure a copy #2540 + (TYPEOF(values)!=VECSXP && i>0) || // assigning the same values to a second column. Have to ensure a copy #2540 + ALTREP(thisvalue) ) { if (verbose) { - Rprintf(_("RHS for item %d has been duplicated because NAMED==%d MAYBE_SHARED==%d, but then is being plonked. length(values)==%d; length(cols)==%d)\n"), - i+1, NAMED(thisvalue), MAYBE_SHARED(thisvalue), length(values), length(cols)); + Rprintf(_("RHS for item %d has been duplicated because NAMED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d)\n"), + i+1, NAMED(thisvalue), MAYBE_SHARED(thisvalue), ALTREP(thisvalue), length(values), length(cols)); } thisvalue = copyAsPlain(thisvalue); // PROTECT not needed as assigned as element to protected list below. } else { From ea1e5af1ea03f5bf459d5bce2fc63a33dd60dba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Fri, 3 Jun 2022 22:35:07 +0200 Subject: [PATCH 2/7] Fix and add tests --- inst/tests/tests.Rraw | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index f453b96208..11fb1c2874 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2273,15 +2273,36 @@ test(753.1, DT[,c("x1","x2"):=4:6, verbose = TRUE], data.table(a=letters[1:3],x= output = "RHS for item 2 has been duplicated") test(753.2, DT[2,x2:=7L], data.table(a=letters[1:3],x=3:1,x1=4:6,x2=c(4L,7L,6L),key="a")) DT = data.table(a=letters[3:1],x=1:3,y=4:6) -test(754.1, DT[,c("x1","y1","x2"):=list(x,y)], error="Supplied 3 columns to be assigned 2 items. Please see NEWS for v1.12.2") -test(754.2, DT[,c("x1","y1","x2"):=list(x,y,x)], data.table(a=letters[3:1],x=1:3,y=4:6,x1=1:3,y1=4:6,x2=1:3)) +test(754.01, DT[,c("x1","y1","x2"):=list(x,y)], error="Supplied 3 columns to be assigned 2 items. Please see NEWS for v1.12.2") +test(754.02, DT[,c("x1","y1","x2"):=list(x,y,x)], data.table(a=letters[3:1],x=1:3,y=4:6,x1=1:3,y1=4:6,x2=1:3)) # And non-recycling i.e. that a single column copy does copy the column DT = data.table(a=1:3) -test(754.3, DT[,b:=a][1,a:=4L][2,b:=5L], data.table(a=INT(4,2,3),b=INT(1,5,3))) -test(754.4, DT[,b:=a][3,b:=6L], data.table(a=INT(4,2,3),b=INT(4,2,6))) -test(754.5, DT[,a:=as.character(a),verbose=TRUE], output="Direct plonk.*no copy") +test(754.03, DT[,b:=a][1,a:=4L][2,b:=5L], data.table(a=INT(4,2,3),b=INT(1,5,3))) +test(754.04, DT[,b:=a][3,b:=6L], data.table(a=INT(4,2,3),b=INT(4,2,6))) +test(754.05, DT[,a:=as.numeric(a),verbose=TRUE], output="Direct plonk.*no copy") RHS = as.integer(DT$a) -test(754.6, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") +test(754.06, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") +# Expand ALTREPS in assign.c, #5400 +# String conversion gets deferred +test(754.07, DT[,a:=as.character(a),verbose=TRUE], output="RHS for item 1 has been duplicated") +# Executing the code inside of test expands the ALTREP so we repeat the code +# in order to check the result after a further assignment +DT = data.table(a=1:3) +DT[,b:=as.character(a)] +DT[, a:=5L] +test(754.08, DT, data.table(a=5L, b=as.character(1:3))) +# This function returns an ALTREP wrapper if the input is at least length 64 +testFun = function(x) { + x[FALSE] = 1 + x +} +DT = data.table(id=1:64, col1=0, col2=0) +test(754.09, DT[, col1 := testFun(col2), verbose = TRUE], output="RHS for item 1 has been duplicated") +DT = data.table(id=1:64, col1=0, col2=0) +DT[, col1 := testFun(col2)] +DT[, col2 := 999] +test(754.10, DT, data.table(id=1:64, col1=0, col2=999)) +rm(testFun) # Used to test warning on redundant by (#2282) but by=.EACHI has now superseded DT = data.table(a=letters[1:3],b=rep(c("d","e"),each=3),x=1:6,key="a,b") From d8266c5c6495bcf7da5628c77ca2de5959b964d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=A1clav=20Tlap=C3=A1k?= <55213630+tlapak@users.noreply.github.com> Date: Fri, 3 Jun 2022 22:35:17 +0200 Subject: [PATCH 3/7] NEWS Item --- NEWS.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/NEWS.md b/NEWS.md index 19fc575e0d..b29f9f02a8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -550,6 +550,8 @@ 53. `as.data.frame(DT, row.names=)` no longer silently ignores `row.names`, [#5319](https://github.com/Rdatatable/data.table/issues/5319). Thanks to @dereckdemezquita for the fix and PR, and @ben-schwen for guidance. +54. `data.table` did not always expand ALTREP columns when assigning a full column by reference. This could result in the target column getting modified unintentionally if the next call to the data.table was a modification by reference of the source column. E.g. in `DT[, b := as.character(a)]` the string conversion gets deferred and subsequent modification of column `a` would also modify column `b`, [#5400](https://github.com/Rdatatable/data.table/issues/5400). Thanks to @aquasync for the report and Václav Tlapák for the PR. + ## NOTES 1. New feature 29 in v1.12.4 (Oct 2019) introduced zero-copy coercion. Our thinking is that requiring you to get the type right in the case of `0` (type double) vs `0L` (type integer) is too inconvenient for you the user. So such coercions happen in `data.table` automatically without warning. Thanks to zero-copy coercion there is no speed penalty, even when calling `set()` many times in a loop, so there's no speed penalty to warn you about either. However, we believe that assigning a character value such as `"2"` into an integer column is more likely to be a user mistake that you would like to be warned about. The type difference (character vs integer) may be the only clue that you have selected the wrong column, or typed the wrong variable to be assigned to that column. For this reason we view character to numeric-like coercion differently and will warn about it. If it is correct, then the warning is intended to nudge you to wrap the RHS with `as.()` so that it is clear to readers of your code that a coercion from character to that type is intended. For example : From 201a1f816c4b7ca7d837b34f1a5640b9ccb738c7 Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 27 Jun 2025 17:40:01 +0000 Subject: [PATCH 4/7] add a new forward-compatibility test --- inst/tests/tests.Rraw | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index d43cff1eff..62db45c999 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2292,24 +2292,26 @@ RHS = as.integer(DT$a) test(754.06, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") # Expand ALTREPS in assign.c, #5400 # String conversion gets deferred -test(754.07, DT[,a:=as.character(a),verbose=TRUE], output="RHS for item 1 has been duplicated") +## first, a regression test of R itself -- we want to make sure our own test continues to be useful & testing its intended purpose +test(754.07, { a = 1:10; .Internal(inspect(a)); b = as.character(a); .Internal(inspect(b))}, output = "\\bcompact\\b.*\\bdeferred string conversion\\b") +test(754.08, DT[,a:=as.character(a),verbose=TRUE], output="RHS for item 1 has been duplicated") # Executing the code inside of test expands the ALTREP so we repeat the code # in order to check the result after a further assignment DT = data.table(a=1:3) DT[,b:=as.character(a)] DT[, a:=5L] -test(754.08, DT, data.table(a=5L, b=as.character(1:3))) +test(754.09, DT, data.table(a=5L, b=as.character(1:3))) # This function returns an ALTREP wrapper if the input is at least length 64 testFun = function(x) { x[FALSE] = 1 x } DT = data.table(id=1:64, col1=0, col2=0) -test(754.09, DT[, col1 := testFun(col2), verbose = TRUE], output="RHS for item 1 has been duplicated") +test(754.10, DT[, col1 := testFun(col2), verbose = TRUE], output="RHS for item 1 has been duplicated") DT = data.table(id=1:64, col1=0, col2=0) DT[, col1 := testFun(col2)] DT[, col2 := 999] -test(754.10, DT, data.table(id=1:64, col1=0, col2=999)) +test(754.11, DT, data.table(id=1:64, col1=0, col2=999)) rm(testFun) # Used to test warning on redundant by (#2282) but by=.EACHI has now superseded From aab8714d65279fbb233867be75b7f6600878465c Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 27 Jun 2025 17:44:35 +0000 Subject: [PATCH 5/7] pass for stlye --- inst/tests/tests.Rraw | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 62db45c999..dcaa55bce5 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -2285,21 +2285,21 @@ test(754.01, DT[,c("x1","y1","x2"):=list(x,y)], error="Supplied 3 columns to be test(754.02, DT[,c("x1","y1","x2"):=list(x,y,x)], data.table(a=letters[3:1],x=1:3,y=4:6,x1=1:3,y1=4:6,x2=1:3)) # And non-recycling i.e. that a single column copy does copy the column DT = data.table(a=1:3) -test(754.03, DT[,b:=a][1,a:=4L][2,b:=5L], data.table(a=INT(4,2,3),b=INT(1,5,3))) -test(754.04, DT[,b:=a][3,b:=6L], data.table(a=INT(4,2,3),b=INT(4,2,6))) -test(754.05, DT[,a:=as.numeric(a),verbose=TRUE], output="Direct plonk.*no copy") +test(754.03, DT[, b := a][1, a := 4L][2, b := 5L], data.table(a=INT(4,2,3),b=INT(1,5,3))) +test(754.04, DT[, b := a][3, b := 6L], data.table(a=INT(4,2,3),b=INT(4,2,6))) +test(754.05, DT[, a := as.numeric(a), verbose=TRUE], output="Direct plonk.*no copy") RHS = as.integer(DT$a) -test(754.06, DT[,a:=RHS,verbose=TRUE], output="RHS for item 1 has been duplicated") +test(754.06, DT[, a:= RHS, verbose=TRUE], output="RHS for item 1 has been duplicated") # Expand ALTREPS in assign.c, #5400 # String conversion gets deferred ## first, a regression test of R itself -- we want to make sure our own test continues to be useful & testing its intended purpose -test(754.07, { a = 1:10; .Internal(inspect(a)); b = as.character(a); .Internal(inspect(b))}, output = "\\bcompact\\b.*\\bdeferred string conversion\\b") -test(754.08, DT[,a:=as.character(a),verbose=TRUE], output="RHS for item 1 has been duplicated") +test(754.07, {a = 1:10; .Internal(inspect(a)); b = as.character(a); .Internal(inspect(b))}, output = "\\bcompact\\b.*\\bdeferred string conversion\\b") +test(754.08, DT[, a := as.character(a), verbose=TRUE], output="RHS for item 1 has been duplicated") # Executing the code inside of test expands the ALTREP so we repeat the code # in order to check the result after a further assignment DT = data.table(a=1:3) -DT[,b:=as.character(a)] -DT[, a:=5L] +DT[, b := as.character(a)] +DT[, a := 5L] test(754.09, DT, data.table(a=5L, b=as.character(1:3))) # This function returns an ALTREP wrapper if the input is at least length 64 testFun = function(x) { From 6777191da89561da04e73f4edcd58d424f87c0cf Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 27 Jun 2025 17:46:59 +0000 Subject: [PATCH 6/7] comment --- src/assign.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/assign.c b/src/assign.c index 63b0d4f52b..b7e13aa4fe 100644 --- a/src/assign.c +++ b/src/assign.c @@ -551,7 +551,7 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) if ( MAYBE_SHARED(thisvalue) || // set() protects the NAMED of atomic vectors from .Call setting arguments to 2 by wrapping with list (TYPEOF(values)==VECSXP && i>LENGTH(values)-1) || // recycled RHS would have columns pointing to others, #185. (TYPEOF(values)!=VECSXP && i>0) || // assigning the same values to a second column. Have to ensure a copy #2540 - ALTREP(thisvalue) + ALTREP(thisvalue) // Some ALTREP wrappers have survived to here, e.g. #5400 ) { if (verbose) { Rprintf(_("RHS for item %d has been duplicated because MAYBE_REFERENCED==%d MAYBE_SHARED==%d ALTREP==%d, but then is being plonked. length(values)==%d; length(cols)==%d\n"), From 6d257f514f325c0e2864bb820f7313c1a42556ce Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 27 Jun 2025 17:51:00 +0000 Subject: [PATCH 7/7] clarify we think this is rare given the lack of related issues --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index c19e5fa1bb..093be68697 100644 --- a/NEWS.md +++ b/NEWS.md @@ -44,7 +44,7 @@ 12. Internal functions used to signal errors are now marked as non-returning, silencing a compiler warning about potentially unchecked allocation failure. Thanks to Prof. Brian D. Ripley for the report and @aitap for the fix, [#7070](https://github.com/Rdatatable/data.table/pull/7070). -13. `data.table` did not always expand ALTREP columns when assigning a full column by reference. This could result in the target column getting modified unintentionally if the next call to the data.table was a modification by reference of the source column. E.g. in `DT[, b := as.character(a)]` the string conversion gets deferred and subsequent modification of column `a` would also modify column `b`, [#5400](https://github.com/Rdatatable/data.table/issues/5400). Thanks to @aquasync for the report and Václav Tlapák for the PR. +13. In rare cases, `data.table` failed to expand ALTREP columns when assigning a full column by reference. This could result in the target column getting modified unintentionally if the next call to the data.table was a modification by reference of the source column. E.g. in `DT[, b := as.character(a)]` the string conversion gets deferred and subsequent modification of column `a` would also modify column `b`, [#5400](https://github.com/Rdatatable/data.table/issues/5400). Thanks to @aquasync for the report and Václav Tlapák for the PR. ### NOTES